From f2209c1fe495235ea2ab747ca9385e3468588306 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 28 Nov 2022 20:28:06 +0800 Subject: [PATCH] Don't shuffle columns in categorical tests. (#8446) --- python-package/xgboost/testing/__init__.py | 7 +++- tests/python/test_demos.py | 1 + tests/python/test_updaters.py | 37 ++++++++++++++-------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 5079ce08d..7c64f499a 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -486,6 +486,7 @@ def make_categorical( onehot: bool, sparsity: float = 0.0, cat_ratio: float = 1.0, + shuffle: bool = False, ) -> Tuple[ArrayLike, np.ndarray]: """Generate categorical features for test. @@ -499,6 +500,8 @@ def make_categorical( The ratio of the amount of missing values over the number of all entries. cat_ratio: The ratio of features that are categorical. + shuffle: + Whether we should shuffle the columns. Returns ------- @@ -538,10 +541,12 @@ def make_categorical( if onehot: df = pd.get_dummies(df) + + if shuffle: columns = list(df.columns) rng.shuffle(columns) df = df[columns] - return pd.get_dummies(df), label + return df, label diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index 8a987492d..35570ba4d 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -170,6 +170,7 @@ def test_json_model() -> None: onehot=False, sparsity=0.5, cat_ratio=0.5, + shuffle=True, ) reg = xgboost.XGBRegressor( n_estimators=2, tree_method="hist", enable_categorical=True diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 1ed43588e..1682d90a1 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -1,6 +1,6 @@ import json from string import ascii_lowercase -from typing import Any, Dict +from typing import Any, Dict, List import numpy as np import pytest @@ -238,17 +238,24 @@ class TestTreeMethod: # Test with partition-based split run(self.USE_PART) - def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method): + def run_categorical_ohe( + self, rows: int, cols: int, rounds: int, cats: int, tree_method: str + ) -> None: onehot, label = tm.make_categorical(rows, cols, cats, True) + print(onehot.columns) cat, _ = tm.make_categorical(rows, cols, cats, False) + print(cat.columns) - by_etl_results = {} - by_builtin_results = {} + by_etl_results: Dict[str, Dict[str, List[float]]] = {} + by_builtin_results: Dict[str, Dict[str, List[float]]] = {} predictor = "gpu_predictor" if tree_method == "gpu_hist" else None - parameters = {"tree_method": tree_method, "predictor": predictor} - # Use one-hot exclusively - parameters["max_cat_to_onehot"] = self.USE_ONEHOT + parameters: Dict[str, Any] = { + "tree_method": tree_method, + "predictor": predictor, + # Use one-hot exclusively + "max_cat_to_onehot": self.USE_ONEHOT + } m = xgb.DMatrix(onehot, label, enable_categorical=False) xgb.train( @@ -268,11 +275,11 @@ class TestTreeMethod: evals_result=by_builtin_results, ) - # There are guidelines on how to specify tolerance based on considering output as - # random variables. But in here the tree construction is extremely sensitive to - # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely - # different tree. So even though the test is quite lenient, hypothesis can still - # pick up falsifying examples from time to time. + # There are guidelines on how to specify tolerance based on considering output + # as random variables. But in here the tree construction is extremely sensitive + # to floating point errors. An 1e-5 error in a histogram bin can lead to an + # entirely different tree. So even though the test is quite lenient, hypothesis + # can still pick up falsifying examples from time to time. np.testing.assert_allclose( np.array(by_etl_results["Train"]["rmse"]), np.array(by_builtin_results["Train"]["rmse"]), @@ -280,7 +287,7 @@ class TestTreeMethod: ) assert tm.non_increasing(by_builtin_results["Train"]["rmse"]) - by_grouping: xgb.callback.TrainingCallback.EvalsLog = {} + by_grouping: Dict[str, Dict[str, List[float]]] = {} # switch to partition-based splits parameters["max_cat_to_onehot"] = self.USE_PART parameters["reg_lambda"] = 0 @@ -313,7 +320,9 @@ class TestTreeMethod: strategies.integers(1, 2), strategies.integers(4, 7)) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - def test_categorical_ohe(self, rows, cols, rounds, cats): + def test_categorical_ohe( + self, rows: int, cols: int, rounds: int, cats: int + ) -> None: self.run_categorical_ohe(rows, cols, rounds, cats, "approx") self.run_categorical_ohe(rows, cols, rounds, cats, "hist")