# xgboost/plugin/updater_gpu/test/python/test_large.py

from __future__ import print_function
#pylint: skip-file
import sys
import time
sys.path.append("../../tests/python")
import xgboost as xgb
import testing as tm
import numpy as np
import unittest
from sklearn.datasets import make_classification
def eprint(*args, **kwargs):
    """Print the message to stderr and then to stdout, flushing each stream.

    Positional and keyword arguments are forwarded to ``print`` unchanged,
    so ``sep`` and ``end`` work as usual.  ``file`` must not be supplied:
    it is set explicitly for each stream.
    """
    # stderr first, then stdout; each stream is flushed right after its write.
    for stream in (sys.stderr, sys.stdout):
        print(*args, file=stream, **kwargs)
        stream.flush()

# Module-level banner: emitted as soon as this file is imported/collected.
eprint("Testing Big Data (this may take a while)")

# Seeded NumPy generator.
# NOTE(review): not referenced by the code visible in this file - confirm
# whether it is still needed.
rng = np.random.RandomState(1994)

# "Realistic" size based upon http://stat-computing.org/dataexpo/2009/ ,
# which has been processed to one-hot encode categoricals.
cols = 31
# Row counts reduced to fit onto 1 GPU but still be large.
rows3 = 5000 # small
rows2 = 4360032 # medium
rows1 = 42360032 # large
#rows1 = 152360032 # can do this for multi-gpu test (very large)
# Sizes exercised by the test, in this order (largest first).
rowslist = [rows1, rows2, rows3]


class TestGPU(unittest.TestCase):
    """Smoke and timing test for the ``gpu_hist`` updater on large dense data.

    For every row count in ``rowslist`` the test generates a random dense
    matrix, wraps it in a ``DMatrix`` and trains one boosting round with
    ``tree_method='gpu_hist'``: first with ``n_gpus=1`` and then with
    ``n_gpus=-1``.  The wall-clock time of each stage is printed.

    Nothing is asserted about the trained models; the test passes when every
    stage completes without raising.
    """

    @staticmethod
    def _gpu_hist_params(n_gpus, max_depth=6, max_bin=1024):
        """Return a fresh ``gpu_hist`` parameter dict for ``xgb.train``.

        Parameters
        ----------
        n_gpus : int
            Value for the ``n_gpus`` parameter.  The test uses ``1`` for the
            single-GPU run and ``-1`` for the "all gpus" run.
        max_depth : int
            Maximum tree depth.
        max_bin : int
            Value for the ``max_bin`` parameter.

        Returns
        -------
        dict
            A new dictionary on every call, so runs never share state.
        """
        return {'max_depth': max_depth,
                'tree_method': 'gpu_hist',
                'nthread': 0,
                'eta': 1,
                'silent': 0,
                'debug_verbose': 5,
                'n_gpus': n_gpus,
                'objective': 'binary:logistic',
                'max_bin': max_bin,
                'eval_metric': 'auc'}

    @staticmethod
    def _train_timed(label, params, dtrain, num_rounds=1):
        """Train on ``dtrain`` and print how long the run took.

        Parameters
        ----------
        label : str
            Message announced through ``eprint`` before training starts.
        params : dict
            Booster parameters passed to ``xgb.train``.
        dtrain : xgb.DMatrix
            Training data; also used as the only evaluation set (``'train'``).
        num_rounds : int
            Number of boosting rounds.

        Returns
        -------
        dict
            The ``evals_result`` dictionary handed to ``xgb.train``.
        """
        start = time.time()
        eprint(label)
        evals_result = {}
        xgb.train(params, dtrain, num_rounds, [(dtrain, 'train')],
                  evals_result=evals_result)
        print("Time to Train: %s seconds" % (time.time() - start))
        return evals_result

    def test_large(self):
        """Run the single-GPU and all-GPU ``gpu_hist`` trainings for each size."""
        eprint("Starting test for large data")
        tm._skip_if_no_sklearn()

        for rows in rowslist:

            eprint("Creating train data rows=%d cols=%d" % (rows, cols))
            start = time.time()
            # Re-seed for every size so each data set is reproducible.
            np.random.seed(7)
            X = np.random.rand(rows, cols)
            # Labels are uniform random floats in [0, 1), not 0/1 classes.
            y = np.random.rand(rows)
            print("Time to Create Data: %r" % (time.time() - start))

            eprint("Starting DMatrix(X,y)")
            start = time.time()
            ag_dtrain = xgb.DMatrix(X, y, nthread=40)
            print("Time to DMatrix: %r" % (time.time() - start))

            # Only the GPU updater is exercised here; no CPU ('exact'/'hist')
            # baseline is trained and no results are compared.
            self._train_timed("gpu_hist updater 1 gpu",
                              self._gpu_hist_params(n_gpus=1), ag_dtrain)
            self._train_timed("gpu_hist updater all gpus",
                              self._gpu_hist_params(n_gpus=-1), ag_dtrain)