Don't shuffle columns in categorical tests. (#8446)

This commit is contained in:
Jiaming Yuan 2022-11-28 20:28:06 +08:00 committed by GitHub
parent 67ea1c3435
commit f2209c1fe4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 30 additions and 15 deletions

View File

@ -486,6 +486,7 @@ def make_categorical(
onehot: bool,
sparsity: float = 0.0,
cat_ratio: float = 1.0,
shuffle: bool = False,
) -> Tuple[ArrayLike, np.ndarray]:
"""Generate categorical features for test.
@ -499,6 +500,8 @@ def make_categorical(
The ratio of the amount of missing values over the number of all entries.
cat_ratio:
The ratio of features that are categorical.
shuffle:
Whether we should shuffle the columns.
Returns
-------
@ -538,10 +541,12 @@ def make_categorical(
if onehot:
df = pd.get_dummies(df)
if shuffle:
columns = list(df.columns)
rng.shuffle(columns)
df = df[columns]
return pd.get_dummies(df), label
return df, label

View File

@ -170,6 +170,7 @@ def test_json_model() -> None:
onehot=False,
sparsity=0.5,
cat_ratio=0.5,
shuffle=True,
)
reg = xgboost.XGBRegressor(
n_estimators=2, tree_method="hist", enable_categorical=True

View File

@ -1,6 +1,6 @@
import json
from string import ascii_lowercase
from typing import Any, Dict
from typing import Any, Dict, List
import numpy as np
import pytest
@ -238,17 +238,24 @@ class TestTreeMethod:
# Test with partition-based split
run(self.USE_PART)
def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
def run_categorical_ohe(
self, rows: int, cols: int, rounds: int, cats: int, tree_method: str
) -> None:
onehot, label = tm.make_categorical(rows, cols, cats, True)
print(onehot.columns)
cat, _ = tm.make_categorical(rows, cols, cats, False)
print(cat.columns)
by_etl_results = {}
by_builtin_results = {}
by_etl_results: Dict[str, Dict[str, List[float]]] = {}
by_builtin_results: Dict[str, Dict[str, List[float]]] = {}
predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
parameters = {"tree_method": tree_method, "predictor": predictor}
# Use one-hot exclusively
parameters["max_cat_to_onehot"] = self.USE_ONEHOT
parameters: Dict[str, Any] = {
"tree_method": tree_method,
"predictor": predictor,
# Use one-hot exclusively
"max_cat_to_onehot": self.USE_ONEHOT
}
m = xgb.DMatrix(onehot, label, enable_categorical=False)
xgb.train(
@ -268,11 +275,11 @@ class TestTreeMethod:
evals_result=by_builtin_results,
)
# There are guidelines on how to specify tolerance based on considering output as
# random variables. But in here the tree construction is extremely sensitive to
floating point errors. A 1e-5 error in a histogram bin can lead to an entirely
# different tree. So even though the test is quite lenient, hypothesis can still
# pick up falsifying examples from time to time.
# There are guidelines on how to specify tolerance based on considering output
# as random variables. But in here the tree construction is extremely sensitive
# to floating point errors. A 1e-5 error in a histogram bin can lead to an
# entirely different tree. So even though the test is quite lenient, hypothesis
# can still pick up falsifying examples from time to time.
np.testing.assert_allclose(
np.array(by_etl_results["Train"]["rmse"]),
np.array(by_builtin_results["Train"]["rmse"]),
@ -280,7 +287,7 @@ class TestTreeMethod:
)
assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
by_grouping: Dict[str, Dict[str, List[float]]] = {}
# switch to partition-based splits
parameters["max_cat_to_onehot"] = self.USE_PART
parameters["reg_lambda"] = 0
@ -313,7 +320,9 @@ class TestTreeMethod:
strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_ohe(self, rows, cols, rounds, cats):
def test_categorical_ohe(
self, rows: int, cols: int, rounds: int, cats: int
) -> None:
self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
self.run_categorical_ohe(rows, cols, rounds, cats, "hist")