From f2209c1fe495235ea2ab747ca9385e3468588306 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 28 Nov 2022 20:28:06 +0800
Subject: [PATCH] Don't shuffle columns in categorical tests. (#8446)

---
 python-package/xgboost/testing/__init__.py |  7 +++-
 tests/python/test_demos.py                 |  1 +
 tests/python/test_updaters.py              | 37 ++++++++++++++--------
 3 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 5079ce08d..7c64f499a 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -486,6 +486,7 @@ def make_categorical(
     onehot: bool,
     sparsity: float = 0.0,
     cat_ratio: float = 1.0,
+    shuffle: bool = False,
 ) -> Tuple[ArrayLike, np.ndarray]:
     """Generate categorical features for test.
 
@@ -499,6 +500,8 @@ def make_categorical(
         The ratio of the amount of missing values over the number of all entries.
     cat_ratio:
         The ratio of features that are categorical.
+    shuffle:
+        Whether we should shuffle the columns.
 
     Returns
     -------
@@ -538,10 +541,12 @@ def make_categorical(
 
     if onehot:
         df = pd.get_dummies(df)
+
+    if shuffle:
         columns = list(df.columns)
         rng.shuffle(columns)
         df = df[columns]
-        return pd.get_dummies(df), label
+
     return df, label
 
 
diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 8a987492d..35570ba4d 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -170,6 +170,7 @@ def test_json_model() -> None:
         onehot=False,
         sparsity=0.5,
         cat_ratio=0.5,
+        shuffle=True,
     )
     reg = xgboost.XGBRegressor(
         n_estimators=2, tree_method="hist", enable_categorical=True
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index 1ed43588e..1682d90a1 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -1,6 +1,6 @@
 import json
 from string import ascii_lowercase
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import numpy as np
 import pytest
@@ -238,17 +238,24 @@ class TestTreeMethod:
         # Test with partition-based split
         run(self.USE_PART)
 
-    def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
+    def run_categorical_ohe(
+        self, rows: int, cols: int, rounds: int, cats: int, tree_method: str
+    ) -> None:
         onehot, label = tm.make_categorical(rows, cols, cats, True)
+        print(onehot.columns)
         cat, _ = tm.make_categorical(rows, cols, cats, False)
+        print(cat.columns)
 
-        by_etl_results = {}
-        by_builtin_results = {}
+        by_etl_results: Dict[str, Dict[str, List[float]]] = {}
+        by_builtin_results: Dict[str, Dict[str, List[float]]] = {}
 
         predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
-        parameters = {"tree_method": tree_method, "predictor": predictor}
-        # Use one-hot exclusively
-        parameters["max_cat_to_onehot"] = self.USE_ONEHOT
+        parameters: Dict[str, Any] = {
+            "tree_method": tree_method,
+            "predictor": predictor,
+            # Use one-hot exclusively
+            "max_cat_to_onehot": self.USE_ONEHOT
+        }
 
         m = xgb.DMatrix(onehot, label, enable_categorical=False)
         xgb.train(
@@ -268,11 +275,11 @@ class TestTreeMethod:
             evals_result=by_builtin_results,
         )
 
-        # There are guidelines on how to specify tolerance based on considering output as
-        # random variables. But in here the tree construction is extremely sensitive to
-        # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely
-        # different tree.  So even though the test is quite lenient, hypothesis can still
-        # pick up falsifying examples from time to time.
+        # There are guidelines on how to specify tolerance based on considering output
+        # as random variables. But here the tree construction is extremely sensitive
+        # to floating point errors. A 1e-5 error in a histogram bin can lead to an
+        # entirely different tree. So even though the test is quite lenient, hypothesis
+        # can still pick up falsifying examples from time to time.
         np.testing.assert_allclose(
             np.array(by_etl_results["Train"]["rmse"]),
             np.array(by_builtin_results["Train"]["rmse"]),
@@ -280,7 +287,7 @@ class TestTreeMethod:
         )
         assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
 
-        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
+        by_grouping: Dict[str, Dict[str, List[float]]] = {}
         # switch to partition-based splits
         parameters["max_cat_to_onehot"] = self.USE_PART
         parameters["reg_lambda"] = 0
@@ -313,7 +320,9 @@ class TestTreeMethod:
            strategies.integers(1, 2), strategies.integers(4, 7))
     @settings(deadline=None, print_blob=True)
     @pytest.mark.skipif(**tm.no_pandas())
-    def test_categorical_ohe(self, rows, cols, rounds, cats):
+    def test_categorical_ohe(
+        self, rows: int, cols: int, rounds: int, cats: int
+    ) -> None:
         self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
         self.run_categorical_ohe(rows, cols, rounds, cats, "hist")