[GPU-Plugin] Fix gpu_hist to allow matrices with more than just 2^{32} elements. Also fixed CPU hist algorithm. (#2518)
This commit is contained in:
committed by
Rory Mitchell
parent
c85bf9859e
commit
ca7fc9fda3
134
plugin/updater_gpu/test/python/test_large.py
Normal file
134
plugin/updater_gpu/test/python/test_large.py
Normal file
@@ -0,0 +1,134 @@
|
||||
from __future__ import print_function
|
||||
#pylint: skip-file
|
||||
import sys
|
||||
sys.path.append("../../tests/python")
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import numpy as np
|
||||
import unittest
|
||||
from sklearn.datasets import make_classification
|
||||
def eprint(*args, **kwargs):
|
||||
print(*args, file=sys.stderr, **kwargs) ; sys.stderr.flush()
|
||||
print(*args, file=sys.stdout, **kwargs) ; sys.stdout.flush()
|
||||
|
||||
eprint("Testing Big Data (this may take a while)")
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
# "realistic" size based upon http://stat-computing.org/dataexpo/2009/ , which has been processed to one-hot encode categoricalsxsy
|
||||
cols = 31
|
||||
# reduced to fit onto 1 gpu but still be large
|
||||
rows2 = 5000 # medium
|
||||
#rows2 = 4032 # fake large for testing
|
||||
rows1 = 42360032 # large
|
||||
#rows2 = 152360032 # can do this for multi-gpu test (very large)
|
||||
rowslist = [rows1, rows2]
|
||||
|
||||
|
||||
class TestGPU(unittest.TestCase):
|
||||
def test_large(self):
|
||||
eprint("Starting test for large data")
|
||||
tm._skip_if_no_sklearn()
|
||||
from sklearn.datasets import load_digits
|
||||
try:
|
||||
from sklearn.model_selection import train_test_split
|
||||
except:
|
||||
from sklearn.cross_validation import train_test_split
|
||||
|
||||
|
||||
for rows in rowslist:
|
||||
|
||||
eprint("Creating train data rows=%d cols=%d" % (rows,cols))
|
||||
X, y = make_classification(rows, n_features=cols, random_state=7)
|
||||
rowstest = int(rows*0.2)
|
||||
eprint("Creating test data rows=%d cols=%d" % (rowstest,cols))
|
||||
# note the new random state. if chose same as train random state, exact methods can memorize and do very well on test even for random data, while hist cannot
|
||||
Xtest, ytest = make_classification(rowstest, n_features=cols, random_state=8)
|
||||
|
||||
eprint("Starting DMatrix(X,y)")
|
||||
ag_dtrain = xgb.DMatrix(X,y)
|
||||
eprint("Starting DMatrix(Xtest,ytest)")
|
||||
ag_dtest = xgb.DMatrix(Xtest,ytest)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
max_depth=6
|
||||
max_bin=1024
|
||||
|
||||
# regression test --- hist must be same as exact on all-categorial data
|
||||
ag_param = {'max_depth': max_depth,
|
||||
'tree_method': 'exact',
|
||||
#'nthread': 1,
|
||||
'eta': 1,
|
||||
'silent': 0,
|
||||
'objective': 'binary:logistic',
|
||||
'eval_metric': 'auc'}
|
||||
ag_paramb = {'max_depth': max_depth,
|
||||
'tree_method': 'hist',
|
||||
#'nthread': 1,
|
||||
'eta': 1,
|
||||
'silent': 0,
|
||||
'objective': 'binary:logistic',
|
||||
'eval_metric': 'auc'}
|
||||
ag_param2 = {'max_depth': max_depth,
|
||||
'tree_method': 'gpu_hist',
|
||||
'eta': 1,
|
||||
'silent': 0,
|
||||
'n_gpus': 1,
|
||||
'objective': 'binary:logistic',
|
||||
'max_bin': max_bin,
|
||||
'eval_metric': 'auc'}
|
||||
ag_param3 = {'max_depth': max_depth,
|
||||
'tree_method': 'gpu_hist',
|
||||
'eta': 1,
|
||||
'silent': 0,
|
||||
'n_gpus': -1,
|
||||
'objective': 'binary:logistic',
|
||||
'max_bin': max_bin,
|
||||
'eval_metric': 'auc'}
|
||||
#ag_param4 = {'max_depth': max_depth,
|
||||
# 'tree_method': 'gpu_exact',
|
||||
# 'eta': 1,
|
||||
# 'silent': 0,
|
||||
# 'n_gpus': 1,
|
||||
# 'objective': 'binary:logistic',
|
||||
# 'max_bin': max_bin,
|
||||
# 'eval_metric': 'auc'}
|
||||
ag_res = {}
|
||||
ag_resb = {}
|
||||
ag_res2 = {}
|
||||
ag_res3 = {}
|
||||
#ag_res4 = {}
|
||||
|
||||
num_rounds = 1
|
||||
|
||||
eprint("normal updater")
|
||||
xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
|
||||
evals_result=ag_res)
|
||||
eprint("hist updater")
|
||||
xgb.train(ag_paramb, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
|
||||
evals_result=ag_resb)
|
||||
eprint("gpu_hist updater 1 gpu")
|
||||
xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
|
||||
evals_result=ag_res2)
|
||||
eprint("gpu_hist updater all gpus")
|
||||
xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
|
||||
evals_result=ag_res3)
|
||||
#eprint("gpu_exact updater")
|
||||
#xgb.train(ag_param4, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
|
||||
# evals_result=ag_res4)
|
||||
|
||||
assert np.fabs(ag_res['train']['auc'][0] - ag_resb['train']['auc'][0])<0.001
|
||||
assert np.fabs(ag_res['train']['auc'][0] - ag_res2['train']['auc'][0])<0.001
|
||||
assert np.fabs(ag_res['train']['auc'][0] - ag_res3['train']['auc'][0])<0.001
|
||||
#assert np.fabs(ag_res['train']['auc'][0] - ag_res4['train']['auc'][0])<0.001
|
||||
|
||||
assert np.fabs(ag_res['test']['auc'][0] - ag_resb['test']['auc'][0])<0.01
|
||||
assert np.fabs(ag_res['test']['auc'][0] - ag_res2['test']['auc'][0])<0.01
|
||||
assert np.fabs(ag_res['test']['auc'][0] - ag_res3['test']['auc'][0])<0.01
|
||||
#assert np.fabs(ag_res['test']['auc'][0] - ag_res4['test']['auc'][0])<0.01
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user