from __future__ import print_function
#pylint: skip-file

import sys
sys.path.append("../../tests/python")

import xgboost as xgb
import testing as tm
import numpy as np
import unittest
from sklearn.datasets import make_classification


def eprint(*args, **kwargs):
    """Print to both stderr and stdout, flushing each, so progress
    messages show up regardless of which stream is captured."""
    print(*args, file=sys.stderr, **kwargs)
    sys.stderr.flush()
    print(*args, file=sys.stdout, **kwargs)
    sys.stdout.flush()


eprint("Testing Big Data (this may take a while)")


rng = np.random.RandomState(1994)

# "realistic" size based upon http://stat-computing.org/dataexpo/2009/ ,
# which has been processed to one-hot encode categoricals
cols = 31

# row counts reduced to fit onto 1 GPU while still being large
rows2 = 5000           # medium
#rows2 = 4032          # fake large for testing
rows1 = 42360032       # large
#rows2 = 152360032     # can use this for a multi-gpu test (very large)
rowslist = [rows1, rows2]
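# Back-of-envelope sizing (added note, not from the original script):
# the large case is 42360032 rows x 31 cols of float64, i.e.
# 42360032 * 31 * 8 bytes ~= 9.8 GiB for the raw training matrix alone,
# before the test split and DMatrix copies.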


class TestGPU(unittest.TestCase):
    def test_large(self):
        eprint("Starting test for large data")
        tm._skip_if_no_sklearn()
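        # (added note) scikit-learn moved train_test_split from
        # cross_validation to model_selection in 0.18; the fallback below
        # keeps older versions working.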
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split

        for rows in rowslist:
            eprint("Creating train data rows=%d cols=%d" % (rows, cols))
            X, y = make_classification(rows, n_features=cols, random_state=7)

            rowstest = int(rows * 0.2)
            eprint("Creating test data rows=%d cols=%d" % (rowstest, cols))
            # Note the new random state: if it matched the train random
            # state, exact methods could memorize and do very well on test
            # even for random data, while hist cannot.
            Xtest, ytest = make_classification(rowstest, n_features=cols,
                                               random_state=8)

            eprint("Starting DMatrix(X,y)")
            ag_dtrain = xgb.DMatrix(X, y)
            eprint("Starting DMatrix(Xtest,ytest)")
            ag_dtest = xgb.DMatrix(Xtest, ytest)
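            # (added note) DMatrix construction copies the numpy arrays into
            # XGBoost's internal format, so peak host memory here is roughly
            # double the raw matrix footprint estimated above.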

            max_depth = 6
            max_bin = 1024

            # regression test --- hist must match exact on all-categorical data
            ag_param = {'max_depth': max_depth,
                        'tree_method': 'exact',
                        #'nthread': 1,
                        'eta': 1,
                        'silent': 0,
                        'objective': 'binary:logistic',
                        'eval_metric': 'auc'}
            ag_paramb = {'max_depth': max_depth,
                         'tree_method': 'hist',
                         #'nthread': 1,
                         'eta': 1,
                         'silent': 0,
                         'objective': 'binary:logistic',
                         'eval_metric': 'auc'}
            ag_param2 = {'max_depth': max_depth,
                         'tree_method': 'gpu_hist',
                         'eta': 1,
                         'silent': 0,
                         'n_gpus': 1,
                         'objective': 'binary:logistic',
                         'max_bin': max_bin,
                         'eval_metric': 'auc'}
            ag_param3 = {'max_depth': max_depth,
                         'tree_method': 'gpu_hist',
                         'eta': 1,
                         'silent': 0,
                         'n_gpus': -1,
                         'objective': 'binary:logistic',
                         'max_bin': max_bin,
                         'eval_metric': 'auc'}
            #ag_param4 = {'max_depth': max_depth,
            #             'tree_method': 'gpu_exact',
            #             'eta': 1,
            #             'silent': 0,
            #             'n_gpus': 1,
            #             'objective': 'binary:logistic',
            #             'max_bin': max_bin,
            #             'eval_metric': 'auc'}
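            # (added summary) Four updater configs run on identical data:
            # CPU 'exact' is the reference; 'hist', single-GPU 'gpu_hist'
            # (n_gpus=1), and all-GPU 'gpu_hist' (n_gpus=-1) must agree
            # with it within the tolerances asserted below.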
            ag_res = {}
            ag_resb = {}
            ag_res2 = {}
            ag_res3 = {}
            #ag_res4 = {}

            num_rounds = 1

            eprint("normal updater")
            xgb.train(ag_param, ag_dtrain, num_rounds,
                      [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_res)
            eprint("hist updater")
            xgb.train(ag_paramb, ag_dtrain, num_rounds,
                      [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_resb)
            eprint("gpu_hist updater 1 gpu")
            xgb.train(ag_param2, ag_dtrain, num_rounds,
                      [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_res2)
            eprint("gpu_hist updater all gpus")
            xgb.train(ag_param3, ag_dtrain, num_rounds,
                      [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_res3)
            #eprint("gpu_exact updater")
            #xgb.train(ag_param4, ag_dtrain, num_rounds,
            #          [(ag_dtrain, 'train'), (ag_dtest, 'test')],
            #          evals_result=ag_res4)

            # train AUC: every hist variant must track the exact reference tightly
            assert np.fabs(ag_res['train']['auc'][0] - ag_resb['train']['auc'][0]) < 0.001
            assert np.fabs(ag_res['train']['auc'][0] - ag_res2['train']['auc'][0]) < 0.001
            assert np.fabs(ag_res['train']['auc'][0] - ag_res3['train']['auc'][0]) < 0.001
            #assert np.fabs(ag_res['train']['auc'][0] - ag_res4['train']['auc'][0]) < 0.001

            # test AUC: looser tolerance, since the test set comes from an
            # independent random state
            assert np.fabs(ag_res['test']['auc'][0] - ag_resb['test']['auc'][0]) < 0.01
            assert np.fabs(ag_res['test']['auc'][0] - ag_res2['test']['auc'][0]) < 0.01
            assert np.fabs(ag_res['test']['auc'][0] - ag_res3['test']['auc'][0]) < 0.01
            #assert np.fabs(ag_res['test']['auc'][0] - ag_res4['test']['auc'][0]) < 0.01
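

# (added) Minimal entry point so the file runs standalone; assumption: the
# original script relied on an external runner such as pytest or nose.
if __name__ == '__main__':
    unittest.main()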