xgboost/plugin/updater_gpu/test/python/test_large.py

from __future__ import print_function
#pylint: skip-file
import sys
sys.path.append("../../tests/python")
import xgboost as xgb
import testing as tm
import numpy as np
import unittest
from sklearn.datasets import make_classification
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs) ; sys.stderr.flush()
    print(*args, file=sys.stdout, **kwargs) ; sys.stdout.flush()

eprint("Testing Big Data (this may take a while)")

rng = np.random.RandomState(1994)

# "realistic" size based upon http://stat-computing.org/dataexpo/2009/ , which has been processed to one-hot encode categoricalsxsy
cols = 31
# reduced to fit onto 1 gpu but still be large
rows2 = 5000 # medium
#rows2 = 4032 # fake large for testing
rows1 = 42360032 # large
#rows2 = 152360032 # can do this for multi-gpu test (very large)
rowslist = [rows1, rows2]


class TestGPU(unittest.TestCase):
    def test_large(self):
        eprint("Starting test for large data")
        tm._skip_if_no_sklearn()
        from sklearn.datasets import load_digits
        try:
            from sklearn.model_selection import train_test_split
        except:
            from sklearn.cross_validation import train_test_split


        for rows in rowslist:

            eprint("Creating train data rows=%d cols=%d" % (rows,cols))
            X, y = make_classification(rows, n_features=cols, random_state=7)
            rowstest = int(rows*0.2)
            eprint("Creating test data rows=%d cols=%d" % (rowstest,cols))
            # note the new random state.  if chose same as train random state, exact methods can memorize and do very well on test even for random data, while hist cannot
            Xtest, ytest = make_classification(rowstest, n_features=cols, random_state=8)

            eprint("Starting DMatrix(X,y)")
            ag_dtrain = xgb.DMatrix(X,y)
            eprint("Starting DMatrix(Xtest,ytest)")
            ag_dtest = xgb.DMatrix(Xtest,ytest)


            max_depth=6
            max_bin=1024

            # regression test --- hist must be same as exact on all-categorial data
            ag_param = {'max_depth': max_depth,
                        'tree_method': 'exact',
                        #'nthread': 1,
                        'eta': 1,
                        'silent': 0,
                        'objective': 'binary:logistic',
                        'eval_metric': 'auc'}
            ag_paramb = {'max_depth': max_depth,
                        'tree_method': 'hist',
                        #'nthread': 1,
                        'eta': 1,
                        'silent': 0,
                        'objective': 'binary:logistic',
                        'eval_metric': 'auc'}
            ag_param2 = {'max_depth': max_depth,
                         'tree_method': 'gpu_hist',
                         'eta': 1,
                         'silent': 0,
                         'n_gpus': 1,
                         'objective': 'binary:logistic',
                         'max_bin': max_bin,
                         'eval_metric': 'auc'}
            ag_param3 = {'max_depth': max_depth,
                         'tree_method': 'gpu_hist',
                         'eta': 1,
                         'silent': 0,
                         'n_gpus': -1,
                         'objective': 'binary:logistic',
                         'max_bin': max_bin,
                         'eval_metric': 'auc'}
            #ag_param4 = {'max_depth': max_depth,
            #             'tree_method': 'gpu_exact',
            #             'eta': 1,
            #             'silent': 0,
            #             'n_gpus': 1,
            #             'objective': 'binary:logistic',
            #             'max_bin': max_bin,
            #             'eval_metric': 'auc'}
            ag_res = {}
            ag_resb = {}
            ag_res2 = {}
            ag_res3 = {}
            #ag_res4 = {}

            num_rounds = 1

            eprint("normal updater")
            xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_res)
            eprint("hist updater")
            xgb.train(ag_paramb, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_resb)
            eprint("gpu_hist updater 1 gpu")
            xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_res2)
            eprint("gpu_hist updater all gpus")
            xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                      evals_result=ag_res3)
            #eprint("gpu_exact updater")
            #xgb.train(ag_param4, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
            #          evals_result=ag_res4)

            assert np.fabs(ag_res['train']['auc'][0] - ag_resb['train']['auc'][0])<0.001
            assert np.fabs(ag_res['train']['auc'][0] - ag_res2['train']['auc'][0])<0.001
            assert np.fabs(ag_res['train']['auc'][0] - ag_res3['train']['auc'][0])<0.001
            #assert np.fabs(ag_res['train']['auc'][0] - ag_res4['train']['auc'][0])<0.001

            assert np.fabs(ag_res['test']['auc'][0] - ag_resb['test']['auc'][0])<0.01
            assert np.fabs(ag_res['test']['auc'][0] - ag_res2['test']['auc'][0])<0.01
            assert np.fabs(ag_res['test']['auc'][0] - ag_res3['test']['auc'][0])<0.01
            #assert np.fabs(ag_res['test']['auc'][0] - ag_res4['test']['auc'][0])<0.01