Rewrite approx (#7214)

This PR rewrites the `approx` tree method to reuse the `hist` codebase, for better performance and code sharing.

The rewrite brings many benefits (a brief usage sketch follows this list):
- Support for both `max_leaves` and `max_depth`.
- Support for `grow_policy`.
- Support for monotonic constraints.
- Support for feature weights.
- Easier bin configuration via `max_bin`.
- Support for categorical data.
- Faster training on most datasets, often by a large factor.
- Support for the prediction cache.
- Significantly better performance for external memory.
- A unified code base between `approx` and `hist`.
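
For illustration, here is a minimal sketch of how these capabilities surface through the regular training API after this change. The parameter names are standard XGBoost ones; the synthetic data and the specific values are made up for the example:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(256, 4))
y = X[:, 0] - X[:, 1] + rng.normal(scale=0.1, size=256)

dtrain = xgb.DMatrix(X, label=y)
# Feature weights bias column sampling towards the first two features;
# they only take effect when some colsample_by* parameter is below 1.
dtrain.set_info(feature_weights=np.array([2.0, 2.0, 1.0, 1.0]))

params = {
    "tree_method": "approx",               # now built on the hist code base
    "grow_policy": "lossguide",            # newly supported by approx
    "max_leaves": 32,                      # newly supported by approx
    "max_bin": 64,                         # easier bin configuration
    "monotone_constraints": "(1,-1,0,0)",  # monotonic constraints
    "colsample_bynode": 0.5,
}
booster = xgb.train(params, dtrain, num_boost_round=10)
```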
Commit 001503186c (parent ed95e77752) by Jiaming Yuan, 2022-01-10 21:15:05 +08:00, committed via GitHub. 22 changed files with 635 additions and 264 deletions.


```diff
@@ -45,14 +45,20 @@ class TestTreeMethod:
         result = train_result(param, dataset.get_dmat(), num_rounds)
         assert tm.non_increasing(result['train'][dataset.metric])
 
-    @given(exact_parameter_strategy, strategies.integers(1, 20),
-           tm.dataset_strategy)
+    @given(
+        exact_parameter_strategy,
+        hist_parameter_strategy,
+        strategies.integers(1, 20),
+        tm.dataset_strategy,
+    )
     @settings(deadline=None)
-    def test_approx(self, param, num_rounds, dataset):
-        param['tree_method'] = 'approx'
+    def test_approx(self, param, hist_param, num_rounds, dataset):
+        param["tree_method"] = "approx"
         param = dataset.set_params(param)
+        param.update(hist_param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
-        assert tm.non_increasing(result['train'][dataset.metric], 1e-3)
+        note(result)
+        assert tm.non_increasing(result["train"][dataset.metric])
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_pruner(self):
```
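
The new `hist_parameter_strategy` supplies a second hypothesis-generated parameter dict (`hist_param`) that is merged into the training parameters via `param.update(hist_param)`, so the hist-style knobs are now exercised for `approx` as well. The real strategy is defined in the suite's test utilities; a plausible sketch of such a strategy, with illustrative value ranges, could look like this:

```python
from hypothesis import strategies

# Illustrative only: the actual definition lives in the test suite's
# shared parameter strategies.
hist_parameter_strategy = strategies.fixed_dictionaries({
    "max_depth": strategies.integers(1, 11),
    "max_leaves": strategies.integers(0, 1024),
    "max_bin": strategies.integers(2, 512),
    "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]),
})
```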
```diff
@@ -126,3 +132,53 @@ class TestTreeMethod:
         y = [1000000., 0., 0., 500000.]
         w = [0, 0, 1, 0]
         model.fit(X, y, sample_weight=w)
+
+    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
+        onehot, label = tm.make_categorical(rows, cols, cats, True)
+        cat, _ = tm.make_categorical(rows, cols, cats, False)
+
+        by_etl_results = {}
+        by_builtin_results = {}
+        predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
+        # Use one-hot splits exclusively, also for the built-in categorical path.
+        parameters = {
+            "tree_method": tree_method,
+            "predictor": predictor,
+            "max_cat_to_onehot": 9999,
+        }
+
+        m = xgb.DMatrix(onehot, label, enable_categorical=False)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_etl_results,
+        )
+
+        m = xgb.DMatrix(cat, label, enable_categorical=True)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_builtin_results,
+        )
+        # There are guidelines on how to specify tolerance by treating the output
+        # as a random variable, but here the tree construction is extremely
+        # sensitive to floating point errors: a 1e-5 error in a histogram bin
+        # can lead to an entirely different tree.  So even though the test is
+        # quite lenient, hypothesis can still pick up falsifying examples from
+        # time to time.
+        np.testing.assert_allclose(
+            np.array(by_etl_results["Train"]["rmse"]),
+            np.array(by_builtin_results["Train"]["rmse"]),
+            rtol=1e-3,
+        )
+        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
+
+    @given(strategies.integers(10, 400), strategies.integers(3, 8),
+           strategies.integers(1, 2), strategies.integers(4, 7))
+    @settings(deadline=None)
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_categorical(self, rows, cols, rounds, cats):
+        self.run_categorical_basic(rows, cols, rounds, cats, "approx")
```
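
The added test checks that training on pre-one-hot-encoded data (the "ETL" path) and training with the built-in categorical support produce near-identical models when `max_cat_to_onehot` forces one-hot splits in both runs. A minimal standalone sketch of the same comparison, using pandas categoricals directly instead of the suite's `tm.make_categorical` helper (the data and parameter values here are made up):

```python
import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
codes = rng.integers(0, 4, size=256)
y = codes.astype(np.float64) + rng.normal(scale=0.1, size=256)

# Built-in categorical handling: pass a pandas categorical column directly.
cat_df = pd.DataFrame({"c": pd.Categorical.from_codes(codes, list("abcd"))})
m_builtin = xgb.DMatrix(cat_df, label=y, enable_categorical=True)

# Baseline: one-hot encode the same column up front.
onehot_df = pd.get_dummies(cat_df["c"], prefix="c").astype(np.float64)
m_onehot = xgb.DMatrix(onehot_df, label=y)

# A large max_cat_to_onehot makes the built-in path use one-hot splits too,
# so the two runs should agree closely.
params = {"tree_method": "approx", "max_cat_to_onehot": 9999}

builtin_res, onehot_res = {}, {}
xgb.train(params, m_builtin, num_boost_round=8,
          evals=[(m_builtin, "Train")], evals_result=builtin_res)
xgb.train(params, m_onehot, num_boost_round=8,
          evals=[(m_onehot, "Train")], evals_result=onehot_res)

np.testing.assert_allclose(
    np.array(builtin_res["Train"]["rmse"]),
    np.array(onehot_res["Train"]["rmse"]),
    rtol=1e-3,
)
```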