from __future__ import print_function #pylint: skip-file import sys sys.path.append("../../tests/python") import xgboost as xgb import testing as tm import numpy as np import unittest from sklearn.datasets import make_classification def eprint(*args, **kwargs): print(*args, file=sys.stderr, **kwargs) ; sys.stderr.flush() print(*args, file=sys.stdout, **kwargs) ; sys.stdout.flush() eprint("Testing Big Data (this may take a while)") rng = np.random.RandomState(1994) # "realistic" size based upon http://stat-computing.org/dataexpo/2009/ , which has been processed to one-hot encode categoricalsxsy cols = 31 # reduced to fit onto 1 gpu but still be large rows2 = 5000 # medium #rows2 = 4032 # fake large for testing rows1 = 42360032 # large #rows2 = 152360032 # can do this for multi-gpu test (very large) rowslist = [rows1, rows2] class TestGPU(unittest.TestCase): def test_large(self): eprint("Starting test for large data") tm._skip_if_no_sklearn() from sklearn.datasets import load_digits try: from sklearn.model_selection import train_test_split except: from sklearn.cross_validation import train_test_split for rows in rowslist: eprint("Creating train data rows=%d cols=%d" % (rows,cols)) X, y = make_classification(rows, n_features=cols, random_state=7) rowstest = int(rows*0.2) eprint("Creating test data rows=%d cols=%d" % (rowstest,cols)) # note the new random state. if chose same as train random state, exact methods can memorize and do very well on test even for random data, while hist cannot Xtest, ytest = make_classification(rowstest, n_features=cols, random_state=8) eprint("Starting DMatrix(X,y)") ag_dtrain = xgb.DMatrix(X,y) eprint("Starting DMatrix(Xtest,ytest)") ag_dtest = xgb.DMatrix(Xtest,ytest) max_depth=6 max_bin=1024 # regression test --- hist must be same as exact on all-categorial data ag_param = {'max_depth': max_depth, 'tree_method': 'exact', #'nthread': 1, 'eta': 1, 'silent': 0, 'objective': 'binary:logistic', 'eval_metric': 'auc'} ag_paramb = {'max_depth': max_depth, 'tree_method': 'hist', #'nthread': 1, 'eta': 1, 'silent': 0, 'objective': 'binary:logistic', 'eval_metric': 'auc'} ag_param2 = {'max_depth': max_depth, 'tree_method': 'gpu_hist', 'eta': 1, 'silent': 0, 'n_gpus': 1, 'objective': 'binary:logistic', 'max_bin': max_bin, 'eval_metric': 'auc'} ag_param3 = {'max_depth': max_depth, 'tree_method': 'gpu_hist', 'eta': 1, 'silent': 0, 'n_gpus': -1, 'objective': 'binary:logistic', 'max_bin': max_bin, 'eval_metric': 'auc'} #ag_param4 = {'max_depth': max_depth, # 'tree_method': 'gpu_exact', # 'eta': 1, # 'silent': 0, # 'n_gpus': 1, # 'objective': 'binary:logistic', # 'max_bin': max_bin, # 'eval_metric': 'auc'} ag_res = {} ag_resb = {} ag_res2 = {} ag_res3 = {} #ag_res4 = {} num_rounds = 1 eprint("normal updater") xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], evals_result=ag_res) eprint("hist updater") xgb.train(ag_paramb, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], evals_result=ag_resb) eprint("gpu_hist updater 1 gpu") xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], evals_result=ag_res2) eprint("gpu_hist updater all gpus") xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], evals_result=ag_res3) #eprint("gpu_exact updater") #xgb.train(ag_param4, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')], # evals_result=ag_res4) assert np.fabs(ag_res['train']['auc'][0] - ag_resb['train']['auc'][0])<0.001 assert np.fabs(ag_res['train']['auc'][0] - ag_res2['train']['auc'][0])<0.001 assert np.fabs(ag_res['train']['auc'][0] - ag_res3['train']['auc'][0])<0.001 #assert np.fabs(ag_res['train']['auc'][0] - ag_res4['train']['auc'][0])<0.001 assert np.fabs(ag_res['test']['auc'][0] - ag_resb['test']['auc'][0])<0.01 assert np.fabs(ag_res['test']['auc'][0] - ag_res2['test']['auc'][0])<0.01 assert np.fabs(ag_res['test']['auc'][0] - ag_res3['test']['auc'][0])<0.01 #assert np.fabs(ag_res['test']['auc'][0] - ag_res4['test']['auc'][0])<0.01