Implement max_cat_threshold for CPU. (#7957)

2022-06-04 11:02:46 +08:00
parent 78694405a6
commit b90c6d25e8
8 changed files with 177 additions and 20 deletions
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -74,8 +74,8 @@ class TestGPUUpdaters:
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
-    def test_categorical(self, rows, cols, rounds, cats):
-        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
+    def test_categorical_ohe(self, rows, cols, rounds, cats):
+        self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist")

    @given(
        strategies.integers(10, 400),
@@ -96,7 +96,7 @@ class TestGPUUpdaters:
        cols = 10
        cats = 32
        rounds = 4
-        self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
+        self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist")

    @pytest.mark.skipif(**tm.no_cupy())
    def test_invalid_category(self):
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -31,6 +31,14 @@ hist_parameter_strategy = strategies.fixed_dictionaries({
    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))


+# Search space for the two categorical-split parameters exercised below.
+# Each is an integer in 1..128, the same range that
+# tm.categorical_dataset_strategy uses for the number of categories.
+cat_parameter_strategy = strategies.fixed_dictionaries(
+    {k: strategies.integers(1, 128) for k in ("max_cat_to_onehot", "max_cat_threshold")}
+)
+
+
 def train_result(param, dmat, num_rounds):
    result = {}
    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
@@ -253,7 +261,7 @@ class TestTreeMethod:
        # Test with partition-based split
        run(self.USE_PART)

-    def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
+    def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
        onehot, label = tm.make_categorical(rows, cols, cats, True)
        cat, _ = tm.make_categorical(rows, cols, cats, False)

@@ -328,9 +336,55 @@ class TestTreeMethod:
           strategies.integers(1, 2), strategies.integers(4, 7))
    @settings(deadline=None, print_blob=True)
    @pytest.mark.skipif(**tm.no_pandas())
-    def test_categorical(self, rows, cols, rounds, cats):
-        self.run_categorical_basic(rows, cols, rounds, cats, "approx")
-        self.run_categorical_basic(rows, cols, rounds, cats, "hist")
+    def test_categorical_ohe(self, rows, cols, rounds, cats):
+        self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
+        self.run_categorical_ohe(rows, cols, rounds, cats, "hist")
+
+    @given(
+        tm.categorical_dataset_strategy,
+        exact_parameter_strategy,
+        hist_parameter_strategy,
+        cat_parameter_strategy,
+        strategies.integers(4, 32),
+        strategies.sampled_from(["hist", "approx"]),
+    )
+    @settings(deadline=None, print_blob=True)
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_categorical(
+        self,
+        dataset: tm.TestDataset,
+        exact_parameters: Dict[str, Any],
+        hist_parameters: Dict[str, Any],
+        cat_parameters: Dict[str, Any],
+        n_rounds: int,
+        tree_method: str,
+    ) -> None:
+        """Train on a generated categorical dataset and inspect the train RMSE."""
+        overrides = {**exact_parameters, **hist_parameters, "tree_method": tree_method}
+        cat_parameters.update(overrides)  # later keys win: exact < hist < tree_method
+        history = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
+        # NOTE(review): return value is discarded; confirm the helper asserts itself.
+        tm.non_increasing(history["train"]["rmse"])
+
+    @given(
+        hist_parameter_strategy,
+        cat_parameter_strategy,
+        strategies.sampled_from(["hist", "approx"]),
+    )
+    @settings(deadline=None, print_blob=True)
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_categorical_ames_housing(
+        self,
+        hist_parameters: Dict[str, Any],
+        cat_parameters: Dict[str, Any],
+        tree_method: str,
+    ) -> None:
+        """Train 16 rounds on the Ames housing subset and inspect the train RMSE."""
+        cat_parameters.update({**hist_parameters, "tree_method": tree_method})
+        name, objective, metric = "ames_housing", "reg:squarederror", "rmse"
+        ames = tm.TestDataset(name, tm.get_ames_housing, objective, metric)
+        log = train_result(cat_parameters, ames.get_dmat(), 16)
+        tm.non_increasing(log["train"]["rmse"])  # NOTE(review): result is discarded

    @given(
        strategies.integers(10, 400),
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -214,7 +214,9 @@ class TestDataset:
        return params_in

    def get_dmat(self):
-        return xgb.DMatrix(self.X, self.y, self.w, base_margin=self.margin)
+        # Same positional data/label/weight as before; only enable_categorical is new.
+        extras = {"base_margin": self.margin, "enable_categorical": True}
+        return xgb.DMatrix(self.X, self.y, self.w, **extras)

    def get_device_dmat(self):
        w = None if self.w is None else cp.array(self.w)
@@ -277,6 +279,48 @@ def get_sparse():
    return X, y


+@memory.cache
+def get_ames_housing():
+    """Return ``(X, y)`` for a 20-feature subset of Ames housing (OpenML id 42165).
+
+    Number of samples: 1460
+    Features: 10 categorical (cast to ``category`` dtype) and 10 numerical.
+    """
+    from sklearn.datasets import fetch_openml
+    X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+
+    categorical_columns_subset: list[str] = [
+        "BldgType",             # 5 cats, no nan
+        "GarageFinish",         # 3 cats, nan
+        "LotConfig",            # 5 cats, no nan
+        "Functional",           # 7 cats, no nan
+        "MasVnrType",           # 4 cats, nan
+        "HouseStyle",           # 8 cats, no nan
+        "FireplaceQu",          # 5 cats, nan
+        "ExterCond",            # 5 cats, no nan
+        "ExterQual",            # 4 cats, no nan
+        "PoolQC",               # 3 cats, nan
+    ]
+
+    numerical_columns_subset: list[str] = [
+        "3SsnPorch",
+        "Fireplaces",
+        "BsmtHalfBath",
+        "HalfBath",
+        "GarageCars",
+        "TotRmsAbvGrd",
+        "BsmtFinSF1",
+        "BsmtFinSF2",
+        "GrLivArea",
+        "ScreenPorch",
+    ]
+
+    # Take an explicit copy so the dtype assignment below writes to our own frame
+    X = X[categorical_columns_subset + numerical_columns_subset].copy()
+    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
+    return X, y
+
+
@memory.cache
 def get_mq2008(dpath):
    from sklearn.datasets import load_svmlight_files
@@ -329,7 +373,6 @@ def make_categorical(
        for i in range(n_features):
            index = rng.randint(low=0, high=n_samples-1, size=int(n_samples * sparsity))
            df.iloc[index, i] = np.NaN
-            assert df.iloc[:, i].isnull().values.any()
            assert n_categories == np.unique(df.dtypes[i].categories).size

    if onehot:
@@ -337,6 +380,41 @@ def make_categorical(
    return df, label


+def _cat_sampled_from():
+    """Build a hypothesis strategy that yields categorical ``TestDataset`` objects."""
+
+    @strategies.composite
+    def _shape(draw):
+        """Draw ``(n_samples, n_features, n_cats, sparsity)`` for one dataset."""
+        unit_interval = strategies.floats(
+            min_value=0,
+            max_value=1,
+            allow_nan=False,
+            allow_infinity=False,
+            allow_subnormal=False,
+        )
+        return (
+            draw(strategies.integers(2, 512)),  # n_samples
+            draw(strategies.integers(1, 4)),  # n_features
+            draw(strategies.integers(1, 128)),  # n_cats
+            draw(unit_interval),  # sparsity
+        )
+
+    def _to_dataset(shape):
+        n_samples, n_features, n_cats, sparsity = shape
+        return TestDataset(
+            f"{n_samples}x{n_features}-{n_cats}-{sparsity}",
+            lambda: make_categorical(n_samples, n_features, n_cats, False, sparsity),
+            "reg:squarederror",
+            "rmse",
+        )
+
+    return _shape().map(_to_dataset)
+
+
+categorical_dataset_strategy = _cat_sampled_from()  # built once at import time
+
+
@memory.cache
 def make_sparse_regression(
    n_samples: int, n_features: int, sparsity: float, as_dense: bool