[GPU-Plugin] Major refactor 2 (#2664)

* Change cmake option * Move source files * Move google tests * Move python tests * Move benchmarks * Move documentation * Remove makefile support * Fix test run * Move GPU tests
2017-09-08 09:57:16 +12:00
parent 8244f6f120
commit 15267eedf2
21 changed files with 76 additions and 249 deletions
--- a/tests/benchmark/benchmark.py
+++ b/tests/benchmark/benchmark.py
@@ -0,0 +1,61 @@
+# pylint: skip-file
+import sys, argparse
+import xgboost as xgb
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+import time
+
+
+def run_benchmark(args, gpu_algorithm, cpu_algorithm):
+    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
+    print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
+    tmp = time.time()
+    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_size, random_state=7)
+    print ("Generate Time: %s seconds" % (str(time.time() - tmp)))
+    tmp = time.time()
+    print ("DMatrix Start")
+    # omp way
+    dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
+    dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
+    print ("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
+
+    param = {'objective': 'binary:logistic',
+             'max_depth': 6,
+             'silent': 0,
+             'n_gpus': 1,
+             'gpu_id': 0,
+             'eval_metric': 'error',
+             'debug_verbose': 0,
+             }
+
+    param['tree_method'] = gpu_algorithm
+    print("Training with '%s'" % param['tree_method'])
+    tmp = time.time()
+    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
+    print ("Train Time: %s seconds" % (str(time.time() - tmp)))
+
+    param['silent'] = 1
+    param['tree_method'] = cpu_algorithm
+    print("Training with '%s'" % param['tree_method'])
+    tmp = time.time()
+    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
+    print ("Time: %s seconds" % (str(time.time() - tmp)))
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--algorithm', choices=['all', 'gpu_exact', 'gpu_hist'], default='all')
+parser.add_argument('--rows', type=int, default=1000000)
+parser.add_argument('--columns', type=int, default=50)
+parser.add_argument('--iterations', type=int, default=500)
+parser.add_argument('--test_size', type=float, default=0.25)
+args = parser.parse_args()
+
+if 'gpu_hist' in args.algorithm:
+    run_benchmark(args, args.algorithm, 'hist')
+elif 'gpu_exact' in args.algorithm:
+    run_benchmark(args, args.algorithm, 'exact')
+elif 'all' in args.algorithm:
+    run_benchmark(args, 'gpu_exact', 'exact')
+    run_benchmark(args, 'gpu_hist', 'hist')
--- a/tests/cpp/common/test_device_helpers.cu
+++ b/tests/cpp/common/test_device_helpers.cu
@@ -0,0 +1,78 @@
+
+/*!
+ * Copyright 2017 XGBoost contributors
+ */
+#include <thrust/device_vector.h>
+#include <xgboost/base.h>
+#include "../../../src/common/device_helpers.cuh"
+#include "gtest/gtest.h"
+
+// Builds a random CSR-like layout: (*row_ptr)[r] is the offset of row r's
+// first element and *rows receives, for every element, the index of the row
+// that owns it. Each row gets rand() % max_row_size elements.
+void CreateTestData(xgboost::bst_uint num_rows, int max_row_size,
+                    thrust::host_vector<int> *row_ptr,
+                    thrust::host_vector<xgboost::bst_uint> *rows) {
+  row_ptr->resize(num_rows + 1);
+  int offset = 0;
+  for (int row = 0; row <= num_rows; row++) {
+    const int begin = offset;
+    (*row_ptr)[row] = begin;
+    offset += rand() % max_row_size;  // NOLINT
+
+    // The final pass only records the closing offset; it owns no elements.
+    if (row >= num_rows) continue;
+    for (int k = begin; k < offset; k++) {
+      rows->push_back(row);
+    }
+  }
+}
+
+// Times a single dh::TransformLbs call over one million randomly sized rows
+// and prints the output size, the elapsed time and the resulting bandwidth.
+// It is not registered as a gtest case in this file.
+void SpeedTest() {
+  int num_rows = 1000000;
+  int max_row_size = 100;
+  dh::CubMemory temp_memory;
+  thrust::host_vector<int> h_row_ptr;
+  thrust::host_vector<xgboost::bst_uint> h_rows;
+  CreateTestData(num_rows, max_row_size, &h_row_ptr, &h_rows);
+  thrust::device_vector<int> row_ptr = h_row_ptr;
+  thrust::device_vector<int> output_row(h_rows.size());
+  auto d_output_row = output_row.data();
+
+  dh::Timer t;
+  dh::TransformLbs(
+      0, &temp_memory, h_rows.size(), dh::raw(row_ptr), row_ptr.size() - 1, false,
+      [=] __device__(size_t idx, size_t ridx) { d_output_row[idx] = ridx; });
+
+  // Wait for the kernel to finish before reading the timer.
+  dh::safe_cuda(cudaDeviceSynchronize());
+  double time = t.elapsedSeconds();
+  const int mb_size = 1048576;
+  size_t size = (sizeof(int) * h_rows.size()) / mb_size;
+  // %zu is the conversion for size_t; %llu expects unsigned long long.
+  printf("size: %zumb, time: %fs, bandwidth: %fmb/s\n", size, time,
+         size / time);
+}
+
+// Runs dh::TransformLbs over several randomly generated row layouts and
+// checks that the row index reported for every element matches the host
+// reference produced by CreateTestData.
+void TestLbs() {
+  srand(17);
+  dh::CubMemory temp_memory;
+
+  const std::vector<int> row_counts = {4, 100, 1000};
+  const std::vector<int> row_size_limits = {4, 100, 1300};
+
+  for (auto num_rows : row_counts) {
+    for (auto max_row_size : row_size_limits) {
+      thrust::host_vector<int> h_row_ptr;
+      thrust::host_vector<xgboost::bst_uint> h_rows;
+      CreateTestData(num_rows, max_row_size, &h_row_ptr, &h_rows);
+
+      thrust::device_vector<size_t> row_ptr = h_row_ptr;
+      thrust::device_vector<int> output_row(h_rows.size());
+      auto d_output_row = output_row.data();
+
+      dh::TransformLbs(
+          0, &temp_memory, h_rows.size(), dh::raw(row_ptr), row_ptr.size() - 1,
+          false,
+          [=] __device__(size_t idx, size_t ridx) { d_output_row[idx] = ridx; });
+
+      // Make sure the kernel has finished before comparing the results.
+      dh::safe_cuda(cudaDeviceSynchronize());
+      ASSERT_TRUE(h_rows == output_row);
+    }
+  }
+}
+TEST(cub_lbs, Test) { TestLbs(); }
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -0,0 +1,73 @@
+
+/*!
+ * Copyright 2017 XGBoost contributors
+ */
+#include <xgboost/c_api.h>
+#include <xgboost/predictor.h>
+#include "gtest/gtest.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace predictor {
+// Checks that "gpu_predictor" and "cpu_predictor" agree on a model holding a
+// single one-leaf tree: batch predictions (within a small tolerance),
+// per-instance predictions, leaf indices and feature contributions.
+TEST(gpu_predictor, Test) {
+  std::unique_ptr<Predictor> gpu_predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor"));
+  std::unique_ptr<Predictor> cpu_predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor"));
+
+  // Model with one tree whose root is a leaf with value 1.5.
+  std::vector<std::unique_ptr<RegTree>> trees;
+  trees.push_back(std::make_unique<RegTree>());
+  trees.back()->InitModel();
+  (*trees.back())[0].set_leaf(1.5f);
+  gbm::GBTreeModel model(0.5);
+  model.CommitModel(std::move(trees), 0);
+  model.param.num_output_group = 1;
+
+  int n_row = 5;
+  int n_col = 5;
+
+  auto dmat = CreateDMatrix(n_row, n_col, 0);
+
+  // Test predict batch
+  std::vector<float> gpu_out_predictions;
+  std::vector<float> cpu_out_predictions;
+  gpu_predictor->PredictBatch(dmat.get(), &gpu_out_predictions, model, 0);
+  cpu_predictor->PredictBatch(dmat.get(), &cpu_out_predictions, model, 0);
+  // Sizes must match, otherwise the element-wise loop below would either
+  // read past the end of the CPU vector or silently skip elements.
+  ASSERT_EQ(gpu_out_predictions.size(), cpu_out_predictions.size());
+  float abs_tolerance = 0.001;
+  for (size_t i = 0; i < gpu_out_predictions.size(); i++) {
+    ASSERT_LT(std::abs(gpu_out_predictions[i] - cpu_out_predictions[i]),
+              abs_tolerance);
+  }
+
+  // Test predict instance
+  auto batch = dmat->RowIterator()->Value();
+  for (size_t i = 0; i < batch.size; i++) {
+    std::vector<float> gpu_instance_out_predictions;
+    std::vector<float> cpu_instance_out_predictions;
+    cpu_predictor->PredictInstance(batch[i], &cpu_instance_out_predictions,
+                                   model);
+    gpu_predictor->PredictInstance(batch[i], &gpu_instance_out_predictions,
+                                   model);
+    // Guard the [0] accesses below.
+    ASSERT_FALSE(gpu_instance_out_predictions.empty());
+    ASSERT_FALSE(cpu_instance_out_predictions.empty());
+    ASSERT_EQ(gpu_instance_out_predictions[0], cpu_instance_out_predictions[0]);
+  }
+
+  // Test predict leaf
+  std::vector<float> gpu_leaf_out_predictions;
+  std::vector<float> cpu_leaf_out_predictions;
+  cpu_predictor->PredictLeaf(dmat.get(), &cpu_leaf_out_predictions, model);
+  gpu_predictor->PredictLeaf(dmat.get(), &gpu_leaf_out_predictions, model);
+  ASSERT_EQ(gpu_leaf_out_predictions.size(), cpu_leaf_out_predictions.size());
+  for (size_t i = 0; i < gpu_leaf_out_predictions.size(); i++) {
+    ASSERT_EQ(gpu_leaf_out_predictions[i], cpu_leaf_out_predictions[i]);
+  }
+
+  // Test predict contribution
+  std::vector<float> gpu_out_contribution;
+  std::vector<float> cpu_out_contribution;
+  cpu_predictor->PredictContribution(dmat.get(), &cpu_out_contribution, model);
+  gpu_predictor->PredictContribution(dmat.get(), &gpu_out_contribution, model);
+  ASSERT_EQ(gpu_out_contribution.size(), cpu_out_contribution.size());
+  for (size_t i = 0; i < gpu_out_contribution.size(); i++) {
+    ASSERT_EQ(gpu_out_contribution[i], cpu_out_contribution[i]);
+  }
+}
+}  // namespace predictor
+}  // namespace xgboost
--- a/tests/cpp/xgboost_test.mk
+++ b/tests/cpp/xgboost_test.mk
@@ -5,17 +5,6 @@ UNITTEST=$(UTEST_ROOT)/xgboost_test
 UNITTEST_SRC=$(wildcard $(UTEST_ROOT)/*.cc $(UTEST_ROOT)/*/*.cc)
 UNITTEST_OBJ=$(patsubst $(UTEST_ROOT)%.cc, $(UTEST_OBJ_ROOT)%.o, $(UNITTEST_SRC))

-# for if and when we add cuda source files into xgboost core
-UNITTEST_CU_SRC=$(wildcard $(UTEST_ROOT)/*.cu $(UTEST_ROOT)/*/*.cu)
-UNITTEST_OBJ += $(patsubst $(UTEST_ROOT)%.cu, $(UTEST_OBJ_ROOT)%.o, $(UNITTEST_CU_SRC))
-
-# tests from grow_gpu plugin (only if CUDA path is enabled!)
-ifeq ($(PLUGIN_UPDATER_GPU),ON)
-  GPU_PLUGIN_FOLDER = plugin/updater_gpu
-  UNITTEST_CU_PLUGIN_SRC = $(wildcard $(GPU_PLUGIN_FOLDER)/test/cpp/*.cu)
-  UNITTEST_OBJ += $(patsubst %.cu, $(UTEST_OBJ_ROOT)/%.o, $(UNITTEST_CU_PLUGIN_SRC))
-endif
-
 GTEST_LIB=$(GTEST_PATH)/lib/
 GTEST_INC=$(GTEST_PATH)/include/

@@ -26,14 +15,6 @@ UNITTEST_DEPS=lib/libxgboost.a $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT)
 COVER_OBJ=$(patsubst %.o, %.gcda, $(ALL_OBJ)) $(patsubst %.o, %.gcda, $(UNITTEST_OBJ))

 # the order of the below targets matter!
-$(UTEST_OBJ_ROOT)/$(GPU_PLUGIN_FOLDER)/test/cpp/%.o: $(GPU_PLUGIN_FOLDER)/test/cpp/%.cu
-	@mkdir -p $(@D)
-	$(NVCC) $(NVCC_FLAGS) -I$(GTEST_INC) -o $@ -c $<
-
-$(UTEST_OBJ_ROOT)/%.o: $(UTEST_ROOT)/%.cu
-	@mkdir -p $(@D)
-	$(NVCC) $(NVCC_FLAGS) -I$(GTEST_INC) -o $@ -c $<
-
 $(UTEST_OBJ_ROOT)/$(GTEST_PATH)/%.o: $(GTEST_PATH)/%.cc
 	@mkdir -p $(@D)
 	$(CXX) $(UNITTEST_CFLAGS) -I$(GTEST_INC) -I$(GTEST_PATH) -o $@ -c $<
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -0,0 +1,37 @@
+from __future__ import print_function
+#pylint: skip-file
+import xgboost as xgb
+import testing as tm
+import numpy as np
+import unittest
+from nose.plugins.attrib import attr
+
+rng = np.random.RandomState(1994)
+
+@attr('gpu')
+class TestGPUPredict (unittest.TestCase):
+    def test_predict(self):
+        iterations = 1
+        np.random.seed(1)
+        test_num_rows = [10,1000,5000]
+        test_num_cols = [10,50,500]
+        for num_rows in test_num_rows:
+            for num_cols in test_num_cols:
+                dm = xgb.DMatrix(np.random.randn(num_rows, num_cols), label=[0, 1] * int(num_rows/2))
+                watchlist = [(dm, 'train')]
+                res = {}
+                param = {
+                    "objective":"binary:logistic",
+                    "predictor":"gpu_predictor",
+                    'eval_metric': 'auc',
+                }
+                bst = xgb.train(param, dm,iterations,evals=watchlist, evals_result=res)
+                assert self.non_decreasing(res["train"]["auc"])
+                gpu_pred = bst.predict(dm, output_margin=True)
+                bst.set_param({"predictor":"cpu_predictor"})
+                cpu_pred = bst.predict(dm, output_margin=True)
+                np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
+
+    def non_decreasing(self, L):
+        return all((x - y) < 0.001 for x, y in zip(L, L[1:]))
+
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -0,0 +1,325 @@
+from __future__ import print_function
+#pylint: skip-file
+import sys
+sys.path.append("../../tests/python")
+import xgboost as xgb
+import testing as tm
+import numpy as np
+import unittest
+from nose.plugins.attrib import attr
+
+rng = np.random.RandomState(1994)
+
+dpath = 'demo/data/'
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+    print(*args, file=sys.stdout, **kwargs)
+        
+@attr('gpu')
+class TestGPU(unittest.TestCase):
+    def test_grow_gpu(self):
+        tm._skip_if_no_sklearn()
+        from sklearn.datasets import load_digits
+        try:
+            from sklearn.model_selection import train_test_split
+        except:
+            from sklearn.cross_validation import train_test_split
+
+        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+
+        ag_param = {'max_depth': 2,
+                    'tree_method': 'exact',
+                    'nthread': 0,
+                    'eta': 1,
+                    'silent': 1,
+                    'debug_verbose': 0,
+                    'objective': 'binary:logistic',
+                    'eval_metric': 'auc'}
+        ag_param2 = {'max_depth': 2,
+                     'tree_method': 'gpu_exact',
+                     'nthread': 0,
+                     'eta': 1,
+                     'silent': 1,
+                     'debug_verbose': 0,
+                     'objective': 'binary:logistic',
+                     'eval_metric': 'auc'}
+        ag_res = {}
+        ag_res2 = {}
+
+        num_rounds = 10
+        xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
+                  evals_result=ag_res)
+        xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
+                  evals_result=ag_res2)
+        assert ag_res['train']['auc'] == ag_res2['train']['auc']
+        assert ag_res['test']['auc'] == ag_res2['test']['auc']
+
+        digits = load_digits(2)
+        X = digits['data']
+        y = digits['target']
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+        dtrain = xgb.DMatrix(X_train, y_train)
+        dtest = xgb.DMatrix(X_test, y_test)
+
+        param = {'objective': 'binary:logistic',
+                 'nthread': 0,
+                 'tree_method': 'gpu_exact',
+                 'max_depth': 3,
+                 'debug_verbose': 0,
+                 'eval_metric': 'auc'}
+        res = {}
+        xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
+                  evals_result=res)
+        assert self.non_decreasing(res['train']['auc'])
+        assert self.non_decreasing(res['test']['auc'])
+
+        # fail-safe test for dense data
+        from sklearn.datasets import load_svmlight_file
+        X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train')
+        X2 = X2.toarray()
+        dtrain2 = xgb.DMatrix(X2, label=y2)
+
+        param = {'objective': 'binary:logistic',
+                 'nthread': 0,
+                 'tree_method': 'gpu_exact',
+                 'max_depth': 2,
+                 'debug_verbose': 0,
+                 'eval_metric': 'auc'}
+        res = {}
+        xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
+
+        assert self.non_decreasing(res['train']['auc'])
+        assert res['train']['auc'][0] >= 0.85
+
+        for j in range(X2.shape[1]):
+            for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
+                X2[i, j] = 2
+
+        dtrain3 = xgb.DMatrix(X2, label=y2)
+        res = {}
+
+        xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res)
+
+        assert self.non_decreasing(res['train']['auc'])
+        assert res['train']['auc'][0] >= 0.85
+
+        for j in range(X2.shape[1]):
+            for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
+                X2[i, j] = 3
+
+        dtrain4 = xgb.DMatrix(X2, label=y2)
+        res = {}
+        xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
+        assert self.non_decreasing(res['train']['auc'])
+        assert res['train']['auc'][0] >= 0.85
+
+        
+    def test_grow_gpu_hist(self):
+        n_gpus=-1
+        tm._skip_if_no_sklearn()
+        from sklearn.datasets import load_digits
+        try:
+            from sklearn.model_selection import train_test_split
+        except:
+            from sklearn.cross_validation import train_test_split
+
+        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+
+
+        for max_depth in range(3,10): # TODO: Doesn't work with 2 for some tests
+            #eprint("max_depth=%d" % (max_depth))
+            
+            for max_bin_i in range(3,11):
+                max_bin = np.power(2,max_bin_i)
+                #eprint("max_bin=%d" % (max_bin))
+
+                
+            
+                # regression test --- hist must be same as exact on all-categorial data
+                ag_param = {'max_depth': max_depth,
+                            'tree_method': 'exact',
+                            'nthread': 0,
+                            'eta': 1,
+                            'silent': 1,
+                            'debug_verbose': 0,
+                            'objective': 'binary:logistic',
+                            'eval_metric': 'auc'}
+                ag_param2 = {'max_depth': max_depth,
+                             'nthread': 0,
+                             'tree_method': 'gpu_hist',
+                             'eta': 1,
+                             'silent': 1,
+                             'debug_verbose': 0,
+                             'n_gpus': 1,
+                             'objective': 'binary:logistic',
+                             'max_bin': max_bin,
+                             'eval_metric': 'auc'}
+                ag_param3 = {'max_depth': max_depth,
+                             'nthread': 0,
+                             'tree_method': 'gpu_hist',
+                             'eta': 1,
+                             'silent': 1,
+                             'debug_verbose': 0,
+                             'n_gpus': n_gpus,
+                             'objective': 'binary:logistic',
+                             'max_bin': max_bin,
+                             'eval_metric': 'auc'}
+                ag_res = {}
+                ag_res2 = {}
+                ag_res3 = {}
+
+                num_rounds = 10
+                #eprint("normal updater");
+                xgb.train(ag_param, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
+                          evals_result=ag_res)
+                #eprint("grow_gpu_hist updater 1 gpu");
+                xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
+                          evals_result=ag_res2)
+                #eprint("grow_gpu_hist updater %d gpus" % (n_gpus));
+                xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train'), (ag_dtest, 'test')],
+                          evals_result=ag_res3)
+                #        assert 1==0
+                assert ag_res['train']['auc'] == ag_res2['train']['auc']
+                assert ag_res['test']['auc'] == ag_res2['test']['auc']
+                assert ag_res['test']['auc'] == ag_res3['test']['auc']
+
+                ######################################################################
+                digits = load_digits(2)
+                X = digits['data']
+                y = digits['target']
+                X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+                dtrain = xgb.DMatrix(X_train, y_train)
+                dtest = xgb.DMatrix(X_test, y_test)
+
+                param = {'objective': 'binary:logistic',
+                         'tree_method': 'gpu_hist',
+                         'nthread': 0,
+                         'max_depth': max_depth,
+                         'n_gpus': 1,
+                         'max_bin': max_bin,
+                         'debug_verbose': 0,
+                         'eval_metric': 'auc'}
+                res = {}
+                #eprint("digits: grow_gpu_hist updater 1 gpu");
+                xgb.train(param, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
+                          evals_result=res)
+                assert self.non_decreasing(res['train']['auc'])
+                #assert self.non_decreasing(res['test']['auc'])
+                param2 = {'objective': 'binary:logistic',
+                          'nthread': 0,
+                          'tree_method': 'gpu_hist',
+                          'max_depth': max_depth,
+                          'n_gpus': n_gpus,
+                          'max_bin': max_bin,
+                          'debug_verbose': 0,
+                          'eval_metric': 'auc'}
+                res2 = {}
+                #eprint("digits: grow_gpu_hist updater %d gpus" % (n_gpus));
+                xgb.train(param2, dtrain, num_rounds, [(dtrain, 'train'), (dtest, 'test')],
+                          evals_result=res2)
+                assert self.non_decreasing(res2['train']['auc'])
+                #assert self.non_decreasing(res2['test']['auc'])
+                assert res['train']['auc'] == res2['train']['auc']
+                #assert res['test']['auc'] == res2['test']['auc']
+
+                ######################################################################
+                # fail-safe test for dense data
+                from sklearn.datasets import load_svmlight_file
+                X2, y2 = load_svmlight_file(dpath + 'agaricus.txt.train')
+                X2 = X2.toarray()
+                dtrain2 = xgb.DMatrix(X2, label=y2)
+
+                param = {'objective': 'binary:logistic',
+                         'nthread': 0,
+                         'tree_method': 'gpu_hist',
+                         'max_depth': max_depth,
+                         'n_gpus': n_gpus,
+                         'max_bin': max_bin,
+                         'debug_verbose': 0,
+                         'eval_metric': 'auc'}
+                res = {}
+                xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
+
+                assert self.non_decreasing(res['train']['auc'])
+                if max_bin>32:
+                    assert res['train']['auc'][0] >= 0.85
+
+                for j in range(X2.shape[1]):
+                    for i in rng.choice(X2.shape[0], size=num_rounds, replace=False):
+                        X2[i, j] = 2
+
+                dtrain3 = xgb.DMatrix(X2, label=y2)
+                res = {}
+
+                xgb.train(param, dtrain3, num_rounds, [(dtrain3, 'train')], evals_result=res)
+
+                assert self.non_decreasing(res['train']['auc'])
+                if max_bin>32:
+                    assert res['train']['auc'][0] >= 0.85
+
+                for j in range(X2.shape[1]):
+                    for i in np.random.choice(X2.shape[0], size=num_rounds, replace=False):
+                        X2[i, j] = 3
+
+                dtrain4 = xgb.DMatrix(X2, label=y2)
+                res = {}
+                xgb.train(param, dtrain4, num_rounds, [(dtrain4, 'train')], evals_result=res)
+                assert self.non_decreasing(res['train']['auc'])
+                if max_bin>32:
+                    assert res['train']['auc'][0] >= 0.85
+
+                ######################################################################
+                # fail-safe test for max_bin
+                param = {'objective': 'binary:logistic',
+                         'nthread': 0,
+                         'tree_method': 'gpu_hist',
+                         'max_depth': max_depth,
+                         'n_gpus': n_gpus,
+                         'debug_verbose': 0,
+                         'eval_metric': 'auc',
+                         'max_bin': max_bin}
+                res = {}
+                xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
+                assert self.non_decreasing(res['train']['auc'])
+                if max_bin>32:
+                    assert res['train']['auc'][0] >= 0.85
+                ######################################################################
+                # subsampling
+                param = {'objective': 'binary:logistic',
+                         'nthread': 0,
+                         'tree_method': 'gpu_hist',
+                         'max_depth': max_depth,
+                         'n_gpus': n_gpus,
+                         'eval_metric': 'auc',
+                         'colsample_bytree': 0.5,
+                         'colsample_bylevel': 0.5,
+                         'subsample': 0.5,
+                         'debug_verbose': 0,
+                         'max_bin': max_bin}
+                res = {}
+                xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
+                assert self.non_decreasing(res['train']['auc'])
+                if max_bin>32:
+                    assert res['train']['auc'][0] >= 0.85
+        ######################################################################
+        # fail-safe test for max_bin=2
+        param = {'objective': 'binary:logistic',
+                 'nthread': 0,
+                 'tree_method': 'gpu_hist',
+                 'max_depth': 2,
+                 'n_gpus': n_gpus,
+                 'debug_verbose': 0,
+                 'eval_metric': 'auc',
+                 'max_bin': 2}
+        res = {}
+        xgb.train(param, dtrain2, num_rounds, [(dtrain2, 'train')], evals_result=res)
+        assert self.non_decreasing(res['train']['auc'])
+        if max_bin>32:
+            assert res['train']['auc'][0] >= 0.85
+        
+        
+    def non_decreasing(self, L):
+            return all((x - y) < 0.001 for x, y in zip(L, L[1:]))
--- a/tests/python-gpu/test_large_sizes.py
+++ b/tests/python-gpu/test_large_sizes.py
@@ -0,0 +1,112 @@
+from __future__ import print_function
+#pylint: skip-file
+import sys
+import time
+sys.path.append("../../tests/python")
+import xgboost as xgb
+import testing as tm
+import numpy as np
+import unittest
+from nose.plugins.attrib import attr
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs) ; sys.stderr.flush()
+    print(*args, file=sys.stdout, **kwargs) ; sys.stdout.flush()
+
+rng = np.random.RandomState(1994)
+
+# "realistic" size based upon http://stat-computing.org/dataexpo/2009/ , which has been processed to one-hot encode categoricals
+cols = 31
+# reduced to fit onto 1 gpu but still be large
+rows3 = 5000 # small
+rows2 = 4360032 # medium
+rows1 = 42360032 # large
+#rows1 = 152360032 # can do this for multi-gpu test (very large)
+rowslist = [rows1, rows2, rows3]
+
+
+@attr('slow')
+class TestGPU(unittest.TestCase):
+    def test_large(self):
+        eprint("Starting test for large data")
+        tm._skip_if_no_sklearn()
+
+        for rows in rowslist:
+            
+            eprint("Creating train data rows=%d cols=%d" % (rows,cols))
+            tmp = time.time()
+            np.random.seed(7)
+            X = np.random.rand(rows, cols)
+            y = np.random.rand(rows)
+            print("Time to Create Data: %r" % (time.time() - tmp))
+
+            eprint("Starting DMatrix(X,y)")
+            tmp = time.time()
+            ag_dtrain = xgb.DMatrix(X,y,nthread=40)
+            print("Time to DMatrix: %r" % (time.time() - tmp))
+
+            max_depth=6
+            max_bin=1024
+                
+            # regression test --- hist must be same as exact on all-categorial data
+            ag_param = {'max_depth': max_depth,
+                        'tree_method': 'exact',
+                        'nthread': 0,
+                        'eta': 1,
+                        'silent': 0,
+                        'debug_verbose': 5,
+                        'objective': 'binary:logistic',
+                        'eval_metric': 'auc'}
+            ag_paramb = {'max_depth': max_depth,
+                        'tree_method': 'hist',
+                        'nthread': 0,
+                        'eta': 1,
+                        'silent': 0,
+                        'debug_verbose': 5,
+                        'objective': 'binary:logistic',
+                        'eval_metric': 'auc'}
+            ag_param2 = {'max_depth': max_depth,
+                        'tree_method': 'gpu_hist',
+                        'nthread': 0,
+                        'eta': 1,
+                        'silent': 0,
+                        'debug_verbose': 5,
+                        'n_gpus': 1,
+                        'objective': 'binary:logistic',
+                        'max_bin': max_bin,
+                        'eval_metric': 'auc'}
+            ag_param3 = {'max_depth': max_depth,
+                         'tree_method': 'gpu_hist',
+                         'nthread': 0,
+                         'eta': 1,
+                         'silent': 0,
+                         'debug_verbose': 5,
+                         'n_gpus': -1,
+                         'objective': 'binary:logistic',
+                         'max_bin': max_bin,
+                         'eval_metric': 'auc'}
+            ag_res = {}
+            ag_resb = {}
+            ag_res2 = {}
+            ag_res3 = {}
+
+            num_rounds = 1
+            tmp = time.time()
+            #eprint("hist updater")
+            #xgb.train(ag_paramb, ag_dtrain, num_rounds, [(ag_dtrain, 'train')],
+            #          evals_result=ag_resb)
+            #print("Time to Train: %s seconds" % (str(time.time() - tmp)))
+
+            tmp = time.time()
+            eprint("gpu_hist updater 1 gpu")
+            xgb.train(ag_param2, ag_dtrain, num_rounds, [(ag_dtrain, 'train')],
+                      evals_result=ag_res2)
+            print("Time to Train: %s seconds" % (str(time.time() - tmp)))
+
+            tmp = time.time()
+            eprint("gpu_hist updater all gpus")
+            xgb.train(ag_param3, ag_dtrain, num_rounds, [(ag_dtrain, 'train')],
+                      evals_result=ag_res3)
+            print("Time to Train: %s seconds" % (str(time.time() - tmp)))
+
+