Remove text loading in basic walk through demo. (#7753)
This commit is contained in:
parent
c467e90ac1
commit
bcce17e688
@ -1,50 +1,65 @@
|
|||||||
"""
|
"""
|
||||||
Getting started with XGBoost
|
Getting started with XGBoost
|
||||||
============================
|
============================
|
||||||
|
|
||||||
|
This is a simple example of using the native XGBoost interface; there are other
|
||||||
|
interfaces in the Python package, such as the scikit-learn interface and the Dask interface.
|
||||||
|
|
||||||
|
|
||||||
|
See :doc:`/python/python_intro` and :doc:`/tutorials/index` for other references.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import scipy.sparse
|
|
||||||
import pickle
|
import pickle
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from sklearn.datasets import load_svmlight_file
|
||||||
|
|
||||||
# Make sure the demo knows where to load the data.
|
# Make sure the demo knows where to load the data.
|
||||||
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
|
XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
|
||||||
DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, 'demo')
|
DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, "demo")
|
||||||
|
|
||||||
# simple example
|
# X is a scipy CSR matrix; XGBoost supports many other input types.
|
||||||
# load file from text file, also binary buffer generated by xgboost
|
X, y = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.train"))
|
||||||
dtrain = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train?indexing_mode=1'))
|
dtrain = xgb.DMatrix(X, y)
|
||||||
dtest = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.test?indexing_mode=1'))
|
# validation set
|
||||||
|
X_test, y_test = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.test"))
|
||||||
|
dtest = xgb.DMatrix(X_test, y_test)
|
||||||
|
|
||||||
# specify parameters via map, definition are same as c++ version
|
# specify parameters via map, definition are same as c++ version
|
||||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||||
|
|
||||||
# specify validations set to watch performance
|
# specify validations set to watch performance
|
||||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||||
|
# number of boosting rounds
|
||||||
num_round = 2
|
num_round = 2
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
|
||||||
|
|
||||||
# this is prediction
|
# run prediction
|
||||||
preds = bst.predict(dtest)
|
preds = bst.predict(dtest)
|
||||||
labels = dtest.get_label()
|
labels = dtest.get_label()
|
||||||
print('error=%f' %
|
print(
|
||||||
(sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
|
"error=%f"
|
||||||
float(len(preds))))
|
% (
|
||||||
bst.save_model('0001.model')
|
sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
|
||||||
|
/ float(len(preds))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
bst.save_model("model-0.json")
|
||||||
# dump model
|
# dump model
|
||||||
bst.dump_model('dump.raw.txt')
|
bst.dump_model("dump.raw.txt")
|
||||||
# dump model with feature map
|
# dump model with feature map
|
||||||
bst.dump_model('dump.nice.txt', os.path.join(DEMO_DIR, 'data/featmap.txt'))
|
bst.dump_model("dump.nice.txt", os.path.join(DEMO_DIR, "data/featmap.txt"))
|
||||||
|
|
||||||
# save dmatrix into binary buffer
|
# save dmatrix into binary buffer
|
||||||
dtest.save_binary('dtest.buffer')
|
dtest.save_binary("dtest.dmatrix")
|
||||||
# save model
|
# save model
|
||||||
bst.save_model('xgb.model')
|
bst.save_model("model-1.json")
|
||||||
# load model and data in
|
# load model and data in
|
||||||
bst2 = xgb.Booster(model_file='xgb.model')
|
bst2 = xgb.Booster(model_file="model-1.json")
|
||||||
dtest2 = xgb.DMatrix('dtest.buffer')
|
dtest2 = xgb.DMatrix("dtest.dmatrix")
|
||||||
preds2 = bst2.predict(dtest2)
|
preds2 = bst2.predict(dtest2)
|
||||||
# assert they are the same
|
# assert they are the same
|
||||||
assert np.sum(np.abs(preds2 - preds)) == 0
|
assert np.sum(np.abs(preds2 - preds)) == 0
|
||||||
@ -56,40 +71,3 @@ bst3 = pickle.loads(pks)
|
|||||||
preds3 = bst3.predict(dtest2)
|
preds3 = bst3.predict(dtest2)
|
||||||
# assert they are the same
|
# assert they are the same
|
||||||
assert np.sum(np.abs(preds3 - preds)) == 0
|
assert np.sum(np.abs(preds3 - preds)) == 0
|
||||||
|
|
||||||
###
|
|
||||||
# build dmatrix from scipy.sparse
|
|
||||||
print('start running example of build DMatrix from scipy.sparse CSR Matrix')
|
|
||||||
labels = []
|
|
||||||
row = []
|
|
||||||
col = []
|
|
||||||
dat = []
|
|
||||||
i = 0
|
|
||||||
for l in open(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train')):
|
|
||||||
arr = l.split()
|
|
||||||
labels.append(int(arr[0]))
|
|
||||||
for it in arr[1:]:
|
|
||||||
k, v = it.split(':')
|
|
||||||
row.append(i)
|
|
||||||
col.append(int(k))
|
|
||||||
dat.append(float(v))
|
|
||||||
i += 1
|
|
||||||
csr = scipy.sparse.csr_matrix((dat, (row, col)))
|
|
||||||
dtrain = xgb.DMatrix(csr, label=labels)
|
|
||||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
|
||||||
|
|
||||||
print('start running example of build DMatrix from scipy.sparse CSC Matrix')
|
|
||||||
# we can also construct from csc matrix
|
|
||||||
csc = scipy.sparse.csc_matrix((dat, (row, col)))
|
|
||||||
dtrain = xgb.DMatrix(csc, label=labels)
|
|
||||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
|
||||||
|
|
||||||
print('start running example of build DMatrix from numpy array')
|
|
||||||
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix
|
|
||||||
# in internal implementation then convert to DMatrix
|
|
||||||
npymat = csr.todense()
|
|
||||||
dtrain = xgb.DMatrix(npymat, label=labels)
|
|
||||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
|
||||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
|
||||||
|
|||||||
@ -45,6 +45,7 @@ including:
|
|||||||
- XGBoost binary buffer file.
|
- XGBoost binary buffer file.
|
||||||
- LIBSVM text format file
|
- LIBSVM text format file
|
||||||
- Comma-separated values (CSV) file
|
- Comma-separated values (CSV) file
|
||||||
|
- Arrow table.
|
||||||
|
|
||||||
(See :doc:`/tutorials/input_format` for detailed description of text input format.)
|
(See :doc:`/tutorials/input_format` for detailed description of text input format.)
|
||||||
|
|
||||||
|
|||||||
@ -565,12 +565,14 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
|||||||
"""Parameters
|
"""Parameters
|
||||||
----------
|
----------
|
||||||
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
|
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
|
||||||
dt.Frame/cudf.DataFrame/cupy.array/dlpack
|
dt.Frame/cudf.DataFrame/cupy.array/dlpack/arrow.Table
|
||||||
|
|
||||||
Data source of DMatrix.
|
Data source of DMatrix.
|
||||||
When data is string or os.PathLike type, it represents the path
|
|
||||||
libsvm format txt file, csv file (by specifying uri parameter
|
When data is string or os.PathLike type, it represents the path to libsvm
|
||||||
'path_to_csv?format=csv'), or binary file that xgboost can read
|
format txt file, csv file (by specifying uri parameter
|
||||||
from.
|
'path_to_csv?format=csv'), or binary file that xgboost can read from.
|
||||||
|
|
||||||
label : array_like
|
label : array_like
|
||||||
Label of the training data.
|
Label of the training data.
|
||||||
weight : array_like
|
weight : array_like
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user