Remove text loading in basic walk through demo. (#7753)
This commit is contained in:
parent
c467e90ac1
commit
bcce17e688
@ -1,50 +1,65 @@
|
||||
"""
|
||||
Getting started with XGBoost
|
||||
============================
|
||||
|
||||
This is a simple example of using the native XGBoost interface; there are other
|
||||
interfaces in the Python package like scikit-learn interface and Dask interface.
|
||||
|
||||
|
||||
See :doc:`/python/python_intro` and :doc:`/tutorials/index` for other references.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
import pickle
|
||||
import xgboost as xgb
|
||||
import os
|
||||
|
||||
from sklearn.datasets import load_svmlight_file
|
||||
|
||||
# Make sure the demo knows where to load the data.
|
||||
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
XGBOOST_ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
|
||||
DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, 'demo')
|
||||
DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, "demo")
|
||||
|
||||
# simple example
|
||||
# load file from text file, also binary buffer generated by xgboost
|
||||
dtrain = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train?indexing_mode=1'))
|
||||
dtest = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.test?indexing_mode=1'))
|
||||
# X is a scipy csr matrix; XGBoost supports many other input types.
|
||||
X, y = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.train"))
|
||||
dtrain = xgb.DMatrix(X, y)
|
||||
# validation set
|
||||
X_test, y_test = load_svmlight_file(os.path.join(DEMO_DIR, "data", "agaricus.txt.test"))
|
||||
dtest = xgb.DMatrix(X_test, y_test)
|
||||
|
||||
# specify parameters via map; definitions are the same as in the C++ version
|
||||
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
|
||||
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
|
||||
|
||||
# specify validation set to watch performance
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
watchlist = [(dtest, "eval"), (dtrain, "train")]
|
||||
# number of boosting rounds
|
||||
num_round = 2
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)
|
||||
|
||||
# this is prediction
|
||||
# run prediction
|
||||
preds = bst.predict(dtest)
|
||||
labels = dtest.get_label()
|
||||
print('error=%f' %
|
||||
(sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
|
||||
float(len(preds))))
|
||||
bst.save_model('0001.model')
|
||||
print(
|
||||
"error=%f"
|
||||
% (
|
||||
sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
|
||||
/ float(len(preds))
|
||||
)
|
||||
)
|
||||
bst.save_model("model-0.json")
|
||||
# dump model
|
||||
bst.dump_model('dump.raw.txt')
|
||||
bst.dump_model("dump.raw.txt")
|
||||
# dump model with feature map
|
||||
bst.dump_model('dump.nice.txt', os.path.join(DEMO_DIR, 'data/featmap.txt'))
|
||||
bst.dump_model("dump.nice.txt", os.path.join(DEMO_DIR, "data/featmap.txt"))
|
||||
|
||||
# save dmatrix into binary buffer
|
||||
dtest.save_binary('dtest.buffer')
|
||||
dtest.save_binary("dtest.dmatrix")
|
||||
# save model
|
||||
bst.save_model('xgb.model')
|
||||
bst.save_model("model-1.json")
|
||||
# load model and data in
|
||||
bst2 = xgb.Booster(model_file='xgb.model')
|
||||
dtest2 = xgb.DMatrix('dtest.buffer')
|
||||
bst2 = xgb.Booster(model_file="model-1.json")
|
||||
dtest2 = xgb.DMatrix("dtest.dmatrix")
|
||||
preds2 = bst2.predict(dtest2)
|
||||
# assert they are the same
|
||||
assert np.sum(np.abs(preds2 - preds)) == 0
|
||||
@ -56,40 +71,3 @@ bst3 = pickle.loads(pks)
|
||||
preds3 = bst3.predict(dtest2)
|
||||
# assert they are the same
|
||||
assert np.sum(np.abs(preds3 - preds)) == 0
|
||||
|
||||
###
|
||||
# build dmatrix from scipy.sparse
|
||||
print('start running example of build DMatrix from scipy.sparse CSR Matrix')
|
||||
labels = []
|
||||
row = []
|
||||
col = []
|
||||
dat = []
|
||||
i = 0
|
||||
for l in open(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train')):
|
||||
arr = l.split()
|
||||
labels.append(int(arr[0]))
|
||||
for it in arr[1:]:
|
||||
k, v = it.split(':')
|
||||
row.append(i)
|
||||
col.append(int(k))
|
||||
dat.append(float(v))
|
||||
i += 1
|
||||
csr = scipy.sparse.csr_matrix((dat, (row, col)))
|
||||
dtrain = xgb.DMatrix(csr, label=labels)
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
|
||||
print('start running example of build DMatrix from scipy.sparse CSC Matrix')
|
||||
# we can also construct from csc matrix
|
||||
csc = scipy.sparse.csc_matrix((dat, (row, col)))
|
||||
dtrain = xgb.DMatrix(csc, label=labels)
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
|
||||
print('start running example of build DMatrix from numpy array')
|
||||
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix
|
||||
# in internal implementation then convert to DMatrix
|
||||
npymat = csr.todense()
|
||||
dtrain = xgb.DMatrix(npymat, label=labels)
|
||||
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
|
||||
@ -45,6 +45,7 @@ including:
|
||||
- XGBoost binary buffer file.
|
||||
- LIBSVM text format file
|
||||
- Comma-separated values (CSV) file
|
||||
- Arrow table.
|
||||
|
||||
(See :doc:`/tutorials/input_format` for detailed description of text input format.)
|
||||
|
||||
|
||||
@ -565,12 +565,14 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
|
||||
"""Parameters
|
||||
----------
|
||||
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
|
||||
dt.Frame/cudf.DataFrame/cupy.array/dlpack
|
||||
dt.Frame/cudf.DataFrame/cupy.array/dlpack/arrow.Table
|
||||
|
||||
Data source of DMatrix.
|
||||
When data is string or os.PathLike type, it represents the path
|
||||
libsvm format txt file, csv file (by specifying uri parameter
|
||||
'path_to_csv?format=csv'), or binary file that xgboost can read
|
||||
from.
|
||||
|
||||
When data is a string or os.PathLike type, it represents the path to a libsvm
|
||||
format txt file, csv file (by specifying uri parameter
|
||||
'path_to_csv?format=csv'), or binary file that xgboost can read from.
|
||||
|
||||
label : array_like
|
||||
Label of the training data.
|
||||
weight : array_like
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user