Handle categorical split in model histogram and dataframe. (#7065)
* Error on get_split_value_histogram when feature is categorical
* Add a category column to output dataframe
This commit is contained in:
@@ -32,15 +32,14 @@ def train_result(param, dmat, num_rounds):
|
||||
|
||||
|
||||
class TestGPUUpdaters:
|
||||
@given(parameter_strategy, strategies.integers(1, 20),
|
||||
tm.dataset_strategy)
|
||||
@given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
|
||||
@settings(deadline=None)
|
||||
def test_gpu_hist(self, param, num_rounds, dataset):
|
||||
param['tree_method'] = 'gpu_hist'
|
||||
param["tree_method"] = "gpu_hist"
|
||||
param = dataset.set_params(param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
note(result)
|
||||
assert tm.non_increasing(result['train'][dataset.metric])
|
||||
assert tm.non_increasing(result["train"][dataset.metric])
|
||||
|
||||
def run_categorical_basic(self, rows, cols, rounds, cats):
|
||||
onehot, label = tm.make_categorical(rows, cols, cats, True)
|
||||
@@ -49,25 +48,40 @@ class TestGPUUpdaters:
|
||||
by_etl_results = {}
|
||||
by_builtin_results = {}
|
||||
|
||||
parameters = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
|
||||
parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
|
||||
|
||||
m = xgb.DMatrix(onehot, label, enable_categorical=True)
|
||||
xgb.train(parameters, m,
|
||||
num_boost_round=rounds,
|
||||
evals=[(m, 'Train')], evals_result=by_etl_results)
|
||||
m = xgb.DMatrix(onehot, label, enable_categorical=False)
|
||||
xgb.train(
|
||||
parameters,
|
||||
m,
|
||||
num_boost_round=rounds,
|
||||
evals=[(m, "Train")],
|
||||
evals_result=by_etl_results,
|
||||
)
|
||||
|
||||
m = xgb.DMatrix(cat, label, enable_categorical=True)
|
||||
xgb.train(parameters, m,
|
||||
num_boost_round=rounds,
|
||||
evals=[(m, 'Train')], evals_result=by_builtin_results)
|
||||
xgb.train(
|
||||
parameters,
|
||||
m,
|
||||
num_boost_round=rounds,
|
||||
evals=[(m, "Train")],
|
||||
evals_result=by_builtin_results,
|
||||
)
|
||||
|
||||
# There are guidelines on how to specify tolerance based on considering output as
|
||||
# random variables. But in here the tree construction is extremely sensitive to
|
||||
# floating point errors. A 1e-5 error in a histogram bin can lead to an entirely
|
||||
# different tree. So even though the test is quite lenient, hypothesis can still
|
||||
# pick up falsifying examples from time to time.
|
||||
np.testing.assert_allclose(
|
||||
np.array(by_etl_results['Train']['rmse']),
|
||||
np.array(by_builtin_results['Train']['rmse']),
|
||||
rtol=1e-3)
|
||||
assert tm.non_increasing(by_builtin_results['Train']['rmse'])
|
||||
np.array(by_etl_results["Train"]["rmse"]),
|
||||
np.array(by_builtin_results["Train"]["rmse"]),
|
||||
rtol=1e-3,
|
||||
)
|
||||
assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
|
||||
|
||||
@given(strategies.integers(10, 400), strategies.integers(3, 8),
|
||||
strategies.integers(1, 5), strategies.integers(4, 7))
|
||||
strategies.integers(1, 2), strategies.integers(4, 7))
|
||||
@settings(deadline=None)
|
||||
@pytest.mark.skipif(**tm.no_pandas())
|
||||
def test_categorical(self, rows, cols, rounds, cats):
|
||||
|
||||
Reference in New Issue
Block a user