Handle categorical split in model histogram and dataframe. (#7065)

* Error on get_split_value_histogram when feature is categorical
* Add a category column to output dataframe
2021-07-02 13:10:36 +08:00
parent 1cd20efe68
commit a5d222fcdb
3 changed files with 96 additions and 23 deletions
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -32,15 +32,14 @@ def train_result(param, dmat, num_rounds):


 class TestGPUUpdaters:
-    @given(parameter_strategy, strategies.integers(1, 20),
-           tm.dataset_strategy)
+    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
    @settings(deadline=None)
    def test_gpu_hist(self, param, num_rounds, dataset):
-        param['tree_method'] = 'gpu_hist'
+        param["tree_method"] = "gpu_hist"
        param = dataset.set_params(param)
        result = train_result(param, dataset.get_dmat(), num_rounds)
        note(result)
-        assert tm.non_increasing(result['train'][dataset.metric])
+        assert tm.non_increasing(result["train"][dataset.metric])

    def run_categorical_basic(self, rows, cols, rounds, cats):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
@@ -49,25 +48,40 @@ class TestGPUUpdaters:
        by_etl_results = {}
        by_builtin_results = {}

-        parameters = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
+        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}

-        m = xgb.DMatrix(onehot, label, enable_categorical=True)
-        xgb.train(parameters, m,
-                  num_boost_round=rounds,
-                  evals=[(m, 'Train')], evals_result=by_etl_results)
+        m = xgb.DMatrix(onehot, label, enable_categorical=False)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_etl_results,
+        )

        m = xgb.DMatrix(cat, label, enable_categorical=True)
-        xgb.train(parameters, m,
-                  num_boost_round=rounds,
-                  evals=[(m, 'Train')], evals_result=by_builtin_results)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_builtin_results,
+        )
+
+        # There are guidelines on how to specify tolerance based on treating outputs as
+        # random variables. But here the tree construction is extremely sensitive to
+        # floating-point errors. A 1e-5 error in a histogram bin can lead to an entirely
+        # different tree.  So even though the test is quite lenient, hypothesis can still
+        # pick up falsifying examples from time to time.
        np.testing.assert_allclose(
-            np.array(by_etl_results['Train']['rmse']),
-            np.array(by_builtin_results['Train']['rmse']),
-            rtol=1e-3)
-        assert tm.non_increasing(by_builtin_results['Train']['rmse'])
+            np.array(by_etl_results["Train"]["rmse"]),
+            np.array(by_builtin_results["Train"]["rmse"]),
+            rtol=1e-3,
+        )
+        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

    @given(strategies.integers(10, 400), strategies.integers(3, 8),
-           strategies.integers(1, 5), strategies.integers(4, 7))
+           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None)
    @pytest.mark.skipif(**tm.no_pandas())
    def test_categorical(self, rows, cols, rounds, cats):