Added finding quantiles on GPU. (#3393)

* Added finding quantiles on GPU.
  - this includes datasets where weights are assigned to data rows
  - as the quantiles found by the new algorithm are not the same as those found by the old one, test thresholds in tests/python-gpu/test_gpu_updaters.py have been adjusted.
* Adjustments and improved testing for finding quantiles on the GPU.
  - added C++ tests for the DeviceSketch() function
  - reduced one of the thresholds in test_gpu_updaters.py
  - adjusted the cuts found by the find_cuts_k kernel
2018-07-27 04:03:16 +02:00
parent e2f09db77a
commit cc6a5a3666
14 changed files with 691 additions and 116 deletions
--- a/tests/python/regression_test_utilities.py
+++ b/tests/python/regression_test_utilities.py
@@ -15,11 +15,16 @@ except ImportError:


 class Dataset:
-    def __init__(self, name, get_dataset, objective, metric, use_external_memory=False):
+    def __init__(self, name, get_dataset, objective, metric,
+                 has_weights=False, use_external_memory=False):
        self.name = name
        self.objective = objective
        self.metric = metric
-        self.X, self.y = get_dataset()
+        if has_weights:
+            self.X, self.y, self.w = get_dataset()
+        else:
+            self.X, self.y = get_dataset()
+            self.w = None
        self.use_external_memory = use_external_memory


@@ -49,6 +54,16 @@ def get_sparse():
    return X, y


+def get_sparse_weights():
+    rng = np.random.RandomState(199)
+    n = 10000
+    sparsity = 0.25
+    X, y = datasets.make_regression(n, random_state=rng)
+    X = np.array([[np.nan if rng.uniform(0, 1) < sparsity else x for x in x_row] for x_row in X])
+    w = np.array([rng.uniform(1, 10) for i in range(n)])
+    return X, y, w
+
+
 def train_dataset(dataset, param_in, num_rounds=10, scale_features=False):
    param = param_in.copy()
    param["objective"] = dataset.objective
@@ -64,9 +79,10 @@ def train_dataset(dataset, param_in, num_rounds=10, scale_features=False):
    if dataset.use_external_memory:
        np.savetxt('tmptmp_1234.csv', np.hstack((dataset.y.reshape(len(dataset.y), 1), X)),
                   delimiter=',')
-        dtrain = xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_')
+        dtrain = xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
+                             weight=dataset.w)
    else:
-        dtrain = xgb.DMatrix(X, dataset.y)
+        dtrain = xgb.DMatrix(X, dataset.y, weight=dataset.w)

    print("Training on dataset: " + dataset.name, file=sys.stderr)
    print("Using parameters: " + str(param), file=sys.stderr)
@@ -112,6 +128,8 @@ def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False):
        Dataset("Digits", get_digits, "multi:softmax", "merror"),
        Dataset("Cancer", get_cancer, "binary:logistic", "error"),
        Dataset("Sparse regression", get_sparse, "reg:linear", "rmse"),
+        Dataset("Sparse regression with weights", get_sparse_weights,
+                "reg:linear", "rmse", has_weights=True),
        Dataset("Boston External Memory", get_boston, "reg:linear", "rmse",
                use_external_memory=True)
    ]
--- a/tests/python/test_linear.py
+++ b/tests/python/test_linear.py
@@ -52,6 +52,10 @@ def assert_classification_result(results):


 class TestLinear(unittest.TestCase):
+
+    datasets = ["Boston", "Digits", "Cancer", "Sparse regression",
+                "Boston External Memory"]
+
    def test_coordinate(self):
        tm._skip_if_no_sklearn()
        variable_param = {'booster': ['gblinear'], 'updater': ['coord_descent'], 'eta': [0.5],
@@ -60,7 +64,7 @@ class TestLinear(unittest.TestCase):
                          'feature_selector': ['cyclic', 'shuffle', 'greedy', 'thrifty']
                          }
        for param in parameter_combinations(variable_param):
-            results = run_suite(param, 200, None, scale_features=True)
+            results = run_suite(param, 200, self.datasets, scale_features=True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)

@@ -72,6 +76,6 @@ class TestLinear(unittest.TestCase):
                          'feature_selector': ['cyclic', 'shuffle']
                          }
        for param in parameter_combinations(variable_param):
-            results = run_suite(param, 200, None, True)
+            results = run_suite(param, 200, self.datasets, True)
            assert_regression_result(results, 1e-2)
            assert_classification_result(results)