Remove text loading in basic walk through demo. (#7753)

This commit is contained in:
Jiaming Yuan 2022-04-01 00:59:42 +08:00 committed by GitHub
parent c467e90ac1
commit bcce17e688
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 62 deletions

View File

@ -1,50 +1,65 @@
""" """
Getting started with XGBoost Getting started with XGBoost
============================ ============================
This is a simple example of using the native XGBoost interface. There are other
interfaces in the Python package like scikit-learn interface and Dask interface.
See :doc:`/python/python_intro` and :doc:`/tutorials/index` for other references.
""" """
import numpy as np import numpy as np
import scipy.sparse
import pickle import pickle
import xgboost as xgb import xgboost as xgb
import os import os
from sklearn.datasets import load_svmlight_file
# Make sure the demo knows where to load the data. # Make sure the demo knows where to load the data.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR)) XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, 'demo') DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, "demo")
# simple example # X is a scipy csr matrix; XGBoost supports many other input types.
# load file from text file, also binary buffer generated by xgboost X, y = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.train"))
dtrain = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train?indexing_mode=1')) dtrain = xgb.DMatrix(X, y)
dtest = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.test?indexing_mode=1')) # validation set
X_test, y_test = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.test"))
dtest = xgb.DMatrix(X_test, y_test)
# specify parameters via map; definitions are the same as in the C++ version # specify parameters via map; definitions are the same as in the C++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
# specify validation sets to watch performance # specify validation sets to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')] watchlist = [(dtest, "eval"), (dtrain, "train")]
# number of boosting rounds
num_round = 2 num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist) bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
# this is prediction # run prediction
preds = bst.predict(dtest) preds = bst.predict(dtest)
labels = dtest.get_label() labels = dtest.get_label()
print('error=%f' % print(
(sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / "error=%f"
float(len(preds)))) % (
bst.save_model('0001.model') sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
/ float(len(preds))
)
)
bst.save_model("model-0.json")
# dump model # dump model
bst.dump_model('dump.raw.txt') bst.dump_model("dump.raw.txt")
# dump model with feature map # dump model with feature map
bst.dump_model('dump.nice.txt', os.path.join(DEMO_DIR, 'data/featmap.txt')) bst.dump_model("dump.nice.txt", os.path.join(DEMO_DIR, "data/featmap.txt"))
# save dmatrix into binary buffer # save dmatrix into binary buffer
dtest.save_binary('dtest.buffer') dtest.save_binary("dtest.dmatrix")
# save model # save model
bst.save_model('xgb.model') bst.save_model("model-1.json")
# load model and data in # load model and data in
bst2 = xgb.Booster(model_file='xgb.model') bst2 = xgb.Booster(model_file="model-1.json")
dtest2 = xgb.DMatrix('dtest.buffer') dtest2 = xgb.DMatrix("dtest.dmatrix")
preds2 = bst2.predict(dtest2) preds2 = bst2.predict(dtest2)
# assert they are the same # assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0 assert np.sum(np.abs(preds2 - preds)) == 0
@ -56,40 +71,3 @@ bst3 = pickle.loads(pks)
preds3 = bst3.predict(dtest2) preds3 = bst3.predict(dtest2)
# assert they are the same # assert they are the same
assert np.sum(np.abs(preds3 - preds)) == 0 assert np.sum(np.abs(preds3 - preds)) == 0
###
# build dmatrix from scipy.sparse
print('start running example of build DMatrix from scipy.sparse CSR Matrix')
labels = []
row = []
col = []
dat = []
i = 0
for l in open(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train')):
arr = l.split()
labels.append(int(arr[0]))
for it in arr[1:]:
k, v = it.split(':')
row.append(i)
col.append(int(k))
dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csr, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)
print('start running example of build DMatrix from scipy.sparse CSC Matrix')
# we can also construct from csc matrix
csc = scipy.sparse.csc_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csc, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)
print('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix
# in internal implementation then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix(npymat, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

View File

@ -45,6 +45,7 @@ including:
- XGBoost binary buffer file. - XGBoost binary buffer file.
- LIBSVM text format file - LIBSVM text format file
- Comma-separated values (CSV) file - Comma-separated values (CSV) file
- Arrow table.
(See :doc:`/tutorials/input_format` for detailed description of text input format.) (See :doc:`/tutorials/input_format` for detailed description of text input format.)

View File

@ -565,12 +565,14 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
"""Parameters """Parameters
---------- ----------
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/ data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
dt.Frame/cudf.DataFrame/cupy.array/dlpack dt.Frame/cudf.DataFrame/cupy.array/dlpack/arrow.Table
Data source of DMatrix. Data source of DMatrix.
When data is string or os.PathLike type, it represents the path
libsvm format txt file, csv file (by specifying uri parameter
'path_to_csv?format=csv'), or binary file that xgboost can read
from.
When data is string or os.PathLike type, it represents the path libsvm
format txt file, csv file (by specifying uri parameter
'path_to_csv?format=csv'), or binary file that xgboost can read from.
label : array_like label : array_like
Label of the training data. Label of the training data.
weight : array_like weight : array_like