Don't shuffle columns in categorical tests. (#8446)
parent 67ea1c3435
commit f2209c1fe4

@@ -486,6 +486,7 @@ def make_categorical(
     onehot: bool,
     sparsity: float = 0.0,
     cat_ratio: float = 1.0,
+    shuffle: bool = False,
 ) -> Tuple[ArrayLike, np.ndarray]:
     """Generate categorical features for test.

@@ -499,6 +500,8 @@ def make_categorical(
         The ratio of the amount of missing values over the number of all entries.
     cat_ratio:
         The ratio of features that are categorical.
+    shuffle:
+        Whether we should shuffle the columns.

     Returns
     -------
@@ -538,10 +541,12 @@ def make_categorical(

     if onehot:
         df = pd.get_dummies(df)

-    columns = list(df.columns)
-    rng.shuffle(columns)
-    df = df[columns]
-    return pd.get_dummies(df), label
+    if shuffle:
+        columns = list(df.columns)
+        rng.shuffle(columns)
+        df = df[columns]
+
     return df, label

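For context, a minimal usage sketch of the new flag (the import path and argument values are illustrative, not taken from the commit): column order is now stable by default, and shuffling is opt-in.

    # Illustrative sketch only, assuming the helper is importable as xgboost.testing.
    import xgboost.testing as tm

    X_stable, y = tm.make_categorical(128, 4, 6, onehot=False)               # new default: stable column order
    X_mixed, _ = tm.make_categorical(128, 4, 6, onehot=False, shuffle=True)  # opt back in to shuffled columns
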
@@ -170,6 +170,7 @@ def test_json_model() -> None:
         onehot=False,
         sparsity=0.5,
         cat_ratio=0.5,
+        shuffle=True,
     )
     reg = xgboost.XGBRegressor(
         n_estimators=2, tree_method="hist", enable_categorical=True
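A rough sketch of the behaviour this test now covers (row/column/category counts are made up; the keyword arguments and estimator settings come from the diff): the regressor consumes the shuffled, sparse categorical frame directly.

    # Sketch only: train on shuffled, sparse categorical input as the updated test does.
    import xgboost
    import xgboost.testing as tm

    X, y = tm.make_categorical(256, 4, 5, onehot=False, sparsity=0.5, cat_ratio=0.5, shuffle=True)
    reg = xgboost.XGBRegressor(n_estimators=2, tree_method="hist", enable_categorical=True)
    reg.fit(X, y)  # category dtypes are handled natively when enable_categorical=True
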
@@ -1,6 +1,6 @@
 import json
 from string import ascii_lowercase
-from typing import Any, Dict
+from typing import Any, Dict, List

 import numpy as np
 import pytest
@@ -238,17 +238,24 @@ class TestTreeMethod:
         # Test with partition-based split
         run(self.USE_PART)

-    def run_categorical_ohe(self, rows, cols, rounds, cats, tree_method):
+    def run_categorical_ohe(
+        self, rows: int, cols: int, rounds: int, cats: int, tree_method: str
+    ) -> None:
         onehot, label = tm.make_categorical(rows, cols, cats, True)
+        print(onehot.columns)
         cat, _ = tm.make_categorical(rows, cols, cats, False)
+        print(cat.columns)

-        by_etl_results = {}
-        by_builtin_results = {}
+        by_etl_results: Dict[str, Dict[str, List[float]]] = {}
+        by_builtin_results: Dict[str, Dict[str, List[float]]] = {}

         predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
-        parameters = {"tree_method": tree_method, "predictor": predictor}
-        # Use one-hot exclusively
-        parameters["max_cat_to_onehot"] = self.USE_ONEHOT
+        parameters: Dict[str, Any] = {
+            "tree_method": tree_method,
+            "predictor": predictor,
+            # Use one-hot exclusively
+            "max_cat_to_onehot": self.USE_ONEHOT
+        }

         m = xgb.DMatrix(onehot, label, enable_categorical=False)
         xgb.train(
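For reference, a hedged sketch of how a results dictionary with this annotation gets filled (dataset sizes and parameter values are placeholders, not from the commit):

    # Sketch: xgb.train fills evals_result with the shape Dict[str, Dict[str, List[float]]].
    from typing import Any, Dict, List
    import xgboost as xgb
    import xgboost.testing as tm

    X, y = tm.make_categorical(256, 4, 5, onehot=True)
    dtrain = xgb.DMatrix(X, y, enable_categorical=False)
    results: Dict[str, Dict[str, List[float]]] = {}
    params: Dict[str, Any] = {"tree_method": "hist"}
    xgb.train(params, dtrain, num_boost_round=8, evals=[(dtrain, "Train")], evals_result=results)
    # results["Train"]["rmse"] then holds one entry per boosting round.
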
@@ -268,11 +275,11 @@ class TestTreeMethod:
             evals_result=by_builtin_results,
         )

-        # There are guidelines on how to specify tolerance based on considering output as
-        # random variables. But in here the tree construction is extremely sensitive to
-        # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely
-        # different tree. So even though the test is quite lenient, hypothesis can still
-        # pick up falsifying examples from time to time.
+        # There are guidelines on how to specify tolerance based on considering output
+        # as random variables. But in here the tree construction is extremely sensitive
+        # to floating point errors. An 1e-5 error in a histogram bin can lead to an
+        # entirely different tree. So even though the test is quite lenient, hypothesis
+        # can still pick up falsifying examples from time to time.
         np.testing.assert_allclose(
             np.array(by_etl_results["Train"]["rmse"]),
             np.array(by_builtin_results["Train"]["rmse"]),
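The rewrapped comment above concerns test leniency; a tiny illustration of the kind of tolerant comparison it refers to (the numbers and tolerance here are made up):

    # Sketch: per-round metrics from two code paths are only required to agree loosely.
    import numpy as np

    np.testing.assert_allclose(
        np.array([0.251, 0.199, 0.173]),  # e.g. RMSE per round from one path
        np.array([0.250, 0.200, 0.174]),  # and from the other
        rtol=1e-2,
    )
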
@@ -280,7 +287,7 @@ class TestTreeMethod:
         )
         assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

-        by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
+        by_grouping: Dict[str, Dict[str, List[float]]] = {}
         # switch to partition-based splits
         parameters["max_cat_to_onehot"] = self.USE_PART
         parameters["reg_lambda"] = 0
@@ -313,7 +320,9 @@ class TestTreeMethod:
            strategies.integers(1, 2), strategies.integers(4, 7))
     @settings(deadline=None, print_blob=True)
     @pytest.mark.skipif(**tm.no_pandas())
-    def test_categorical_ohe(self, rows, cols, rounds, cats):
+    def test_categorical_ohe(
+        self, rows: int, cols: int, rounds: int, cats: int
+    ) -> None:
         self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
         self.run_categorical_ohe(rows, cols, rounds, cats, "hist")