Multi-threaded XGDMatrixCreateFromMat for faster DMatrix creation (#2530)

* Multi-threaded XGDMatrixCreateFromMat for faster DMatrix creation from numpy arrays in the Python interface.
This commit is contained in:
PSEUDOTENSOR / Jonathan McKinney
2017-07-20 19:43:17 -07:00
committed by Rory Mitchell
parent 56550ff3f1
commit 6b375f6ad8
9 changed files with 324 additions and 73 deletions

View File

@@ -5,13 +5,18 @@ import numpy as np
from sklearn.datasets import make_classification
import time
n = 1000000
num_rounds = 500
def run_benchmark(args, gpu_algorithm, cpu_algorithm):
print("Generating dataset: {} rows * {} columns".format(args.rows,args.columns))
tmp = time.time()
X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
dtrain = xgb.DMatrix(X, y)
print ("Generate Time: %s seconds" % (str(time.time() - tmp)))
tmp = time.time()
print ("DMatrix Start")
# omp way
dtrain = xgb.DMatrix(X, y, nthread=-1)
# non-omp way
#dtrain = xgb.DMatrix(X, y)
print ("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
param = {'objective': 'binary:logistic',
'max_depth': 6,
@@ -24,7 +29,7 @@ def run_benchmark(args, gpu_algorithm, cpu_algorithm):
print("Training with '%s'" % param['tree_method'])
tmp = time.time()
xgb.train(param, dtrain, args.iterations)
print ("Time: %s seconds" % (str(time.time() - tmp)))
print ("Train Time: %s seconds" % (str(time.time() - tmp)))
param['silent'] = 1
param['tree_method'] = cpu_algorithm

View File

@@ -343,8 +343,6 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
}
void GPUHistBuilder::BuildHist(int depth) {
// dh::Timer time;
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
size_t begin = device_element_segments[d_idx];
@@ -1070,9 +1068,9 @@ void GPUHistBuilder::Update(const std::vector<bst_gpair>& gpair,
this->InitData(gpair, *p_fmat, *p_tree);
this->InitFirstNode(gpair);
this->ColSampleTree();
for (int depth = 0; depth < param.max_depth; depth++) {
this->ColSampleLevel();
this->BuildHist(depth);
this->FindSplit(depth);
this->UpdatePosition(depth);

View File

@@ -29,13 +29,14 @@ class TestGPU(unittest.TestCase):
ag_param = {'max_depth': 2,
'tree_method': 'exact',
'nthread': 1,
'nthread': 0,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': 2,
'tree_method': 'gpu_exact',
'nthread': 0,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
@@ -59,6 +60,7 @@ class TestGPU(unittest.TestCase):
dtest = xgb.DMatrix(X_test, y_test)
param = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_exact',
'max_depth': 3,
'eval_metric': 'auc'}
@@ -75,6 +77,7 @@ class TestGPU(unittest.TestCase):
dtrain2 = xgb.DMatrix(X2, label=y2)
param = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_exact',
'max_depth': 2,
'eval_metric': 'auc'}
@@ -128,26 +131,28 @@ class TestGPU(unittest.TestCase):
# regression test --- hist must be same as exact on all-categorical data
ag_param = {'max_depth': max_depth,
'tree_method': 'exact',
'nthread': 1,
'nthread': 0,
'eta': 1,
'silent': 1,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': max_depth,
'nthread': 0,
'tree_method': 'gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': 1,
'objective': 'binary:logistic',
'max_bin': max_bin,
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_param3 = {'max_depth': max_depth,
'nthread': 0,
'tree_method': 'gpu_hist',
'eta': 1,
'silent': 1,
'n_gpus': n_gpus,
'objective': 'binary:logistic',
'max_bin': max_bin,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
ag_res = {}
ag_res2 = {}
@@ -178,6 +183,7 @@ class TestGPU(unittest.TestCase):
param = {'objective': 'binary:logistic',
'tree_method': 'gpu_hist',
'nthread': 0,
'max_depth': max_depth,
'n_gpus': 1,
'max_bin': max_bin,
@@ -189,6 +195,7 @@ class TestGPU(unittest.TestCase):
assert self.non_decreasing(res['train']['auc'])
#assert self.non_decreasing(res['test']['auc'])
param2 = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
@@ -211,6 +218,7 @@ class TestGPU(unittest.TestCase):
dtrain2 = xgb.DMatrix(X2, label=y2)
param = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
@@ -250,6 +258,7 @@ class TestGPU(unittest.TestCase):
######################################################################
# fail-safe test for max_bin
param = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
@@ -263,6 +272,7 @@ class TestGPU(unittest.TestCase):
######################################################################
# subsampling
param = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_hist',
'max_depth': max_depth,
'n_gpus': n_gpus,
@@ -279,6 +289,7 @@ class TestGPU(unittest.TestCase):
######################################################################
# fail-safe test for max_bin=2
param = {'objective': 'binary:logistic',
'nthread': 0,
'tree_method': 'gpu_hist',
'max_depth': 2,
'n_gpus': n_gpus,

View File

@@ -18,62 +18,48 @@ rng = np.random.RandomState(1994)
# "realistic" size based upon http://stat-computing.org/dataexpo/2009/ , which has been processed to one-hot encode categoricals
cols = 31
# reduced to fit onto 1 gpu but still be large
rows2 = 5000 # medium
#rows2 = 4032 # fake large for testing
rows3 = 5000 # small
rows2 = 4360032 # medium
rows1 = 42360032 # large
#rows2 = 152360032 # can do this for multi-gpu test (very large)
rowslist = [rows1, rows2]
#rows1 = 152360032 # can do this for multi-gpu test (very large)
rowslist = [rows1, rows2, rows3]
class TestGPU(unittest.TestCase):
def test_large(self):
eprint("Starting test for large data")
tm._skip_if_no_sklearn()
from sklearn.datasets import load_digits
try:
from sklearn.model_selection import train_test_split
except:
from sklearn.cross_validation import train_test_split
for rows in rowslist:
eprint("Creating train data rows=%d cols=%d" % (rows,cols))
X, y = make_classification(rows, n_features=cols, random_state=7)
rowstest = int(rows*0.2)
eprint("Creating test data rows=%d cols=%d" % (rowstest,cols))
# note the new random state: if the same random state as the train set were chosen, exact methods could memorize and do very well on the test set even for random data, while hist cannot
Xtest, ytest = make_classification(rowstest, n_features=cols, random_state=8)
np.random.seed(7)
X = np.random.rand(rows, cols)
y = np.random.rand(rows)
eprint("Starting DMatrix(X,y)")
ag_dtrain = xgb.DMatrix(X,y)
eprint("Starting DMatrix(Xtest,ytest)")
ag_dtest = xgb.DMatrix(Xtest,ytest)
ag_dtrain = xgb.DMatrix(X,y,nthread=0)
max_depth=6
max_bin=1024
# regression test --- hist must be same as exact on all-categorical data
ag_param = {'max_depth': max_depth,
'tree_method': 'exact',
#'nthread': 1,
'nthread': 0,
'eta': 1,
'silent': 0,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_paramb = {'max_depth': max_depth,
'tree_method': 'hist',
#'nthread': 1,
'nthread': 0,
'eta': 1,
'silent': 0,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
ag_param2 = {'max_depth': max_depth,
'tree_method': 'gpu_hist',
'nthread': 0,
'eta': 1,
'silent': 0,
'n_gpus': 1,
@@ -82,26 +68,18 @@ class TestGPU(unittest.TestCase):
'eval_metric': 'auc'}
ag_param3 = {'max_depth': max_depth,
'tree_method': 'gpu_hist',
'nthread': 0,
'eta': 1,
'silent': 0,
'n_gpus': -1,
'objective': 'binary:logistic',
'max_bin': max_bin,
'eval_metric': 'auc'}
#ag_param4 = {'max_depth': max_depth,
# 'tree_method': 'gpu_exact',
# 'eta': 1,
# 'silent': 0,
# 'n_gpus': 1,
# 'objective': 'binary:logistic',
# 'max_bin': max_bin,
# 'eval_metric': 'auc'}
ag_res = {}
ag_resb = {}
ag_res2 = {}
ag_res3 = {}
#ag_res4 = {}
num_rounds = 1
eprint("normal updater")
@@ -116,19 +94,10 @@ class TestGPU(unittest.TestCase):
eprint("gpu_hist updater all gpus")
xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=ag_res3)
#eprint("gpu_exact updater")
#xgb.train(ag_param4, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
# evals_result=ag_res4)
assert np.fabs(ag_res['train']['auc'][0] - ag_resb['train']['auc'][0])<0.001
assert np.fabs(ag_res['train']['auc'][0] - ag_res2['train']['auc'][0])<0.001
assert np.fabs(ag_res['train']['auc'][0] - ag_res3['train']['auc'][0])<0.001
#assert np.fabs(ag_res['train']['auc'][0] - ag_res4['train']['auc'][0])<0.001
assert np.fabs(ag_res['test']['auc'][0] - ag_resb['test']['auc'][0])<0.01
assert np.fabs(ag_res['test']['auc'][0] - ag_res2['test']['auc'][0])<0.01
assert np.fabs(ag_res['test']['auc'][0] - ag_res3['test']['auc'][0])<0.01
#assert np.fabs(ag_res['test']['auc'][0] - ag_res4['test']['auc'][0])<0.01