Rewrite approx (#7214)

This PR rewrites the approx tree method to use codebase from hist for better performance and code sharing. The rewrite has many benefits: - Support for both `max_leaves` and `max_depth`. - Support for `grow_policy`. - Support for mono constraint. - Support for feature weights. - Support for easier bin configuration (`max_bin`). - Support for categorical data. - Faster performance for most of the datasets. (many times faster) - Support for prediction cache. - Significantly better performance for external memory. - Unites the code base between approx and hist.
2022-01-10 21:15:05 +08:00
parent ed95e77752
commit 001503186c
22 changed files with 635 additions and 264 deletions
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -7,6 +7,8 @@ from hypothesis import given, strategies, assume, settings, note

 sys.path.append("tests/python")
 import testing as tm
+import test_updaters as test_up
+

 parameter_strategy = strategies.fixed_dictionaries({
    'max_depth': strategies.integers(0, 11),
@@ -32,6 +34,8 @@ def train_result(param, dmat, num_rounds):


 class TestGPUUpdaters:
+    cputest = test_up.TestTreeMethod()
+
    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
@@ -41,51 +45,12 @@ class TestGPUUpdaters:
        note(result)
        assert tm.non_increasing(result["train"][dataset.metric])

-    def run_categorical_basic(self, rows, cols, rounds, cats):
-        onehot, label = tm.make_categorical(rows, cols, cats, True)
-        cat, _ = tm.make_categorical(rows, cols, cats, False)
-
-        by_etl_results = {}
-        by_builtin_results = {}
-
-        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
-
-        m = xgb.DMatrix(onehot, label, enable_categorical=False)
-        xgb.train(
-            parameters,
-            m,
-            num_boost_round=rounds,
-            evals=[(m, "Train")],
-            evals_result=by_etl_results,
-        )
-
-        m = xgb.DMatrix(cat, label, enable_categorical=True)
-        xgb.train(
-            parameters,
-            m,
-            num_boost_round=rounds,
-            evals=[(m, "Train")],
-            evals_result=by_builtin_results,
-        )
-
-        # There are guidelines on how to specify tolerance based on considering output as
-        # random variables. But in here the tree construction is extremely sensitive to
-        # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely
-        # different tree.  So even though the test is quite lenient, hypothesis can still
-        # pick up falsifying examples from time to time.
-        np.testing.assert_allclose(
-            np.array(by_etl_results["Train"]["rmse"]),
-            np.array(by_builtin_results["Train"]["rmse"]),
-            rtol=1e-3,
-        )
-        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
-
    @given(strategies.integers(10, 400), strategies.integers(3, 8),
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):
-        self.run_categorical_basic(rows, cols, rounds, cats)
+        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")

    def test_categorical_32_cat(self):
        '''32 hits the bound of integer bitset, so special test'''
@@ -93,7 +58,7 @@ class TestGPUUpdaters:
        cols = 10
        cats = 32
        rounds = 4
-        self.run_categorical_basic(rows, cols, rounds, cats)
+        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")

    def test_invalid_categorical(self):
        import cupy as cp