# xgboost/tests/python/test_updaters.py
import json
from string import ascii_lowercase
from typing import Any, Dict, List
import numpy as np
import pytest
from hypothesis import given, note, settings, strategies
from xgboost.testing.params import (
cat_parameter_strategy,
exact_parameter_strategy,
hist_parameter_strategy,
)
import xgboost as xgb
from xgboost import testing as tm
def train_result(param, dmat, num_rounds):
    """Train for ``num_rounds`` rounds and return the evaluation history."""
    result = {}
    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
              evals_result=result)
    return result
class TestTreeMethod:
USE_ONEHOT = np.iinfo(np.int32).max
USE_PART = 1
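    # ``max_cat_to_onehot`` is the threshold below which one-hot splits are
    # used for a categorical feature.  Setting it to the int32 maximum forces
    # one-hot splits everywhere; setting it to 1 forces partition-based splits.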
@given(exact_parameter_strategy, strategies.integers(1, 20),
tm.dataset_strategy)
@settings(deadline=None, print_blob=True)
def test_exact(self, param, num_rounds, dataset):
if dataset.name.endswith("-l1"):
return
param['tree_method'] = 'exact'
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)
assert tm.non_increasing(result['train'][dataset.metric])
@given(
exact_parameter_strategy,
hist_parameter_strategy,
strategies.integers(1, 20),
tm.dataset_strategy,
)
@settings(deadline=None, print_blob=True)
def test_approx(self, param, hist_param, num_rounds, dataset):
param["tree_method"] = "approx"
param = dataset.set_params(param)
param.update(hist_param)
result = train_result(param, dataset.get_dmat(), num_rounds)
note(result)
assert tm.non_increasing(result["train"][dataset.metric])
@pytest.mark.skipif(**tm.no_sklearn())
def test_pruner(self):
        from sklearn.datasets import load_breast_cancer

        params = {'tree_method': 'exact'}
        cancer = load_breast_cancer()
        X = cancer['data']
        y = cancer['target']
dtrain = xgb.DMatrix(X, y)
booster = xgb.train(params, dtrain=dtrain, num_boost_round=10)
grown = str(booster.get_dump())
params = {'updater': 'prune', 'process_type': 'update', 'gamma': '0.2'}
booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
xgb_model=booster)
after_prune = str(booster.get_dump())
assert grown != after_prune
booster = xgb.train(params, dtrain=dtrain, num_boost_round=10,
xgb_model=booster)
second_prune = str(booster.get_dump())
        # A second prune is a no-op: every split with gain below gamma was
        # already removed by the first prune.
assert after_prune == second_prune
@given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
tm.dataset_strategy)
@settings(deadline=None, print_blob=True)
def test_hist(self, param, hist_param, num_rounds, dataset):
param['tree_method'] = 'hist'
param = dataset.set_params(param)
param.update(hist_param)
result = train_result(param, dataset.get_dmat(), num_rounds)
note(result)
assert tm.non_increasing(result['train'][dataset.metric])
@given(tm.sparse_datasets_strategy)
@settings(deadline=None, print_blob=True)
def test_sparse(self, dataset):
param = {"tree_method": "hist", "max_bin": 64}
hist_result = train_result(param, dataset.get_dmat(), 16)
note(hist_result)
assert tm.non_increasing(hist_result['train'][dataset.metric])
param = {"tree_method": "approx", "max_bin": 64}
approx_result = train_result(param, dataset.get_dmat(), 16)
note(approx_result)
assert tm.non_increasing(approx_result['train'][dataset.metric])
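        # hist and approx share the quantile sketching implementation, so with
        # identical max_bin their training RMSE should match on these datasets.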
np.testing.assert_allclose(
hist_result["train"]["rmse"], approx_result["train"]["rmse"]
)
    def test_hist_categorical(self):
        # hist must give the same result as exact on all-categorical data.
dpath = 'demo/data/'
ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
ag_param = {'max_depth': 2,
'tree_method': 'hist',
'eta': 1,
'verbosity': 0,
'objective': 'binary:logistic',
'eval_metric': 'auc'}
hist_res = {}
exact_res = {}
xgb.train(ag_param, ag_dtrain, 10,
[(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=hist_res)
ag_param["tree_method"] = "exact"
xgb.train(ag_param, ag_dtrain, 10,
[(ag_dtrain, 'train'), (ag_dtest, 'test')],
evals_result=exact_res)
assert hist_res['train']['auc'] == exact_res['train']['auc']
assert hist_res['test']['auc'] == exact_res['test']['auc']
@pytest.mark.skipif(**tm.no_sklearn())
def test_hist_degenerate_case(self):
# Test a degenerate case where the quantile sketcher won't return any
# quantile points for a particular feature (the second feature in
# this example). Source: https://github.com/dmlc/xgboost/issues/2943
nan = np.nan
param = {'missing': nan, 'tree_method': 'hist'}
model = xgb.XGBRegressor(**param)
X = np.array([[6.18827160e+05, 1.73000000e+02], [6.37345679e+05, nan],
[6.38888889e+05, nan], [6.28086420e+05, nan]])
y = [1000000., 0., 0., 500000.]
w = [0, 0, 1, 0]
model.fit(X, y, sample_weight=w)
def run_invalid_category(self, tree_method: str) -> None:
rng = np.random.default_rng()
        # A category value larger than the int32 maximum must be rejected.
X = rng.integers(low=0, high=4, size=1000).reshape(100, 10)
y = rng.normal(loc=0, scale=1, size=100)
X[13, 7] = np.iinfo(np.int32).max + 1
# Check is performed during sketching.
Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
with pytest.raises(ValueError):
xgb.train({"tree_method": tree_method}, Xy)
X[13, 7] = 16777216
Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
with pytest.raises(ValueError):
xgb.train({"tree_method": tree_method}, Xy)
        # Floating-point data with negative values cannot be used as categories.
X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
y = rng.normal(loc=0, scale=1, size=100)
Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
with pytest.raises(ValueError):
xgb.train({"tree_method": tree_method}, Xy)
if tree_method == "gpu_hist":
import cupy as cp
X, y = cp.array(X), cp.array(y)
with pytest.raises(ValueError):
Xy = xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)
def test_invalid_category(self) -> None:
self.run_invalid_category("approx")
self.run_invalid_category("hist")
def run_max_cat(self, tree_method: str) -> None:
"""Test data with size smaller than number of categories."""
import pandas as pd
rng = np.random.default_rng(0)
n_cat = 100
n = 5
X = pd.Series(
["".join(rng.choice(list(ascii_lowercase), size=3)) for i in range(n_cat)],
dtype="category",
)[:n].to_frame()
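        # X keeps only the first ``n`` rows, but its categorical dtype still
        # carries all ``n_cat`` categories.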
reg = xgb.XGBRegressor(
enable_categorical=True,
tree_method=tree_method,
n_estimators=10,
)
y = pd.Series(range(n))
reg.fit(X=X, y=y, eval_set=[(X, y)])
assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
@pytest.mark.skipif(**tm.no_pandas())
def test_max_cat(self, tree_method) -> None:
self.run_max_cat(tree_method)
def run_categorical_missing(
self, rows: int, cols: int, cats: int, tree_method: str
) -> None:
parameters: Dict[str, Any] = {"tree_method": tree_method}
cat, label = tm.make_categorical(
n_samples=rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5
)
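        # sparsity=0.5 leaves roughly half of the entries missing, so both
        # split strategies below are exercised on missing categorical values.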
Xy = xgb.DMatrix(cat, label, enable_categorical=True)
        def run(max_cat_to_onehot: int) -> None:
            # Train with the given one-hot threshold and verify the recorded
            # evaluation history.
parameters["max_cat_to_onehot"] = max_cat_to_onehot
evals_result: Dict[str, Dict] = {}
booster = xgb.train(
parameters,
Xy,
num_boost_round=16,
evals=[(Xy, "Train")],
evals_result=evals_result
)
assert tm.non_increasing(evals_result["Train"]["rmse"])
y_predt = booster.predict(Xy)
rmse = tm.root_mean_square(label, y_predt)
np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])
# Test with OHE split
run(self.USE_ONEHOT)
# Test with partition-based split
run(self.USE_PART)
def run_categorical_ohe(
self, rows: int, cols: int, rounds: int, cats: int, tree_method: str
) -> None:
onehot, label = tm.make_categorical(rows, cols, cats, True)
cat, _ = tm.make_categorical(rows, cols, cats, False)
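        # The same data in two encodings: pre-expanded one-hot columns vs. the
        # native categorical dtype handled by XGBoost itself.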
by_etl_results: Dict[str, Dict[str, List[float]]] = {}
by_builtin_results: Dict[str, Dict[str, List[float]]] = {}
predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
parameters: Dict[str, Any] = {
"tree_method": tree_method,
"predictor": predictor,
# Use one-hot exclusively
"max_cat_to_onehot": self.USE_ONEHOT
}
m = xgb.DMatrix(onehot, label, enable_categorical=False)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_etl_results,
)
m = xgb.DMatrix(cat, label, enable_categorical=True)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_builtin_results,
)
        # There are guidelines on how to specify tolerance when outputs are
        # treated as random variables, but here the tree construction is
        # extremely sensitive to floating point errors: a 1e-5 error in a
        # histogram bin can lead to an entirely different tree.  So even
        # though the test is quite lenient, hypothesis can still pick up
        # falsifying examples from time to time.
np.testing.assert_allclose(
np.array(by_etl_results["Train"]["rmse"]),
np.array(by_builtin_results["Train"]["rmse"]),
rtol=1e-3,
)
assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
by_grouping: Dict[str, Dict[str, List[float]]] = {}
        # Switch to partition-based splits.
parameters["max_cat_to_onehot"] = self.USE_PART
parameters["reg_lambda"] = 0
m = xgb.DMatrix(cat, label, enable_categorical=True)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_grouping,
)
rmse_oh = by_builtin_results["Train"]["rmse"]
rmse_group = by_grouping["Train"]["rmse"]
        # Without regularization, partition-based splits are always at least
        # as good as one-hot splits.
for a, b in zip(rmse_oh, rmse_group):
assert a >= b
parameters["reg_lambda"] = 1.0
by_grouping = {}
xgb.train(
parameters,
m,
num_boost_round=32,
evals=[(m, "Train")],
evals_result=by_grouping,
)
assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping
@given(strategies.integers(10, 400), strategies.integers(3, 8),
strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_ohe(
self, rows: int, cols: int, rounds: int, cats: int
) -> None:
self.run_categorical_ohe(rows, cols, rounds, cats, "approx")
self.run_categorical_ohe(rows, cols, rounds, cats, "hist")
@given(
tm.categorical_dataset_strategy,
exact_parameter_strategy,
hist_parameter_strategy,
cat_parameter_strategy,
strategies.integers(4, 32),
strategies.sampled_from(["hist", "approx"]),
)
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical(
self,
dataset: tm.TestDataset,
exact_parameters: Dict[str, Any],
hist_parameters: Dict[str, Any],
cat_parameters: Dict[str, Any],
n_rounds: int,
tree_method: str,
) -> None:
cat_parameters.update(exact_parameters)
cat_parameters.update(hist_parameters)
cat_parameters["tree_method"] = tree_method
results = train_result(cat_parameters, dataset.get_dmat(), n_rounds)
        assert tm.non_increasing(results["train"]["rmse"])
@given(
hist_parameter_strategy,
cat_parameter_strategy,
strategies.sampled_from(["hist", "approx"]),
)
@settings(deadline=None, print_blob=True)
def test_categorical_ames_housing(
self,
hist_parameters: Dict[str, Any],
cat_parameters: Dict[str, Any],
tree_method: str,
) -> None:
cat_parameters.update(hist_parameters)
dataset = tm.TestDataset(
"ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
)
cat_parameters["tree_method"] = tree_method
results = train_result(cat_parameters, dataset.get_dmat(), 16)
        assert tm.non_increasing(results["train"]["rmse"])
@given(
strategies.integers(10, 400),
strategies.integers(3, 8),
strategies.integers(4, 7)
)
@settings(deadline=None, print_blob=True)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_missing(self, rows, cols, cats):
self.run_categorical_missing(rows, cols, cats, "approx")
self.run_categorical_missing(rows, cols, cats, "hist")
    def run_adaptive(self, tree_method, weighted) -> None:
        """Check that ``reg:absoluteerror`` estimates base_score from the data."""
rng = np.random.RandomState(1994)
from sklearn.datasets import make_regression
from sklearn.utils import stats
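        # ``stats._weighted_percentile`` is a private scikit-learn helper, used
        # here only to compute the weighted median for comparison.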
n_samples = 256
X, y = make_regression(n_samples, 16, random_state=rng)
if weighted:
w = rng.normal(size=n_samples)
w -= w.min()
Xy = xgb.DMatrix(X, y, weight=w)
base_score = stats._weighted_percentile(y, w, percentile=50)
else:
Xy = xgb.DMatrix(X, y)
base_score = np.median(y)
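        # booster_0 pins base_score to the median explicitly, while booster_1
        # lets the objective estimate it; the resulting configs should agree.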
booster_0 = xgb.train(
{
"tree_method": tree_method,
"base_score": base_score,
"objective": "reg:absoluteerror",
},
Xy,
num_boost_round=1,
)
booster_1 = xgb.train(
{"tree_method": tree_method, "objective": "reg:absoluteerror"},
Xy,
num_boost_round=1,
)
config_0 = json.loads(booster_0.save_config())
config_1 = json.loads(booster_1.save_config())
def get_score(config: Dict) -> float:
return float(config["learner"]["learner_model_param"]["base_score"])
assert get_score(config_0) == get_score(config_1)
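        # The estimated intercept must survive a save/load round trip, both in
        # the deprecated binary format and in UBJSON.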
raw_booster = booster_1.save_raw(raw_format="deprecated")
booster_2 = xgb.Booster(model_file=raw_booster)
config_2 = json.loads(booster_2.save_config())
assert get_score(config_1) == get_score(config_2)
raw_booster = booster_1.save_raw(raw_format="ubj")
booster_2 = xgb.Booster(model_file=raw_booster)
config_2 = json.loads(booster_2.save_config())
assert get_score(config_1) == get_score(config_2)
booster_0 = xgb.train(
{
"tree_method": tree_method,
"base_score": base_score + 1.0,
"objective": "reg:absoluteerror",
},
Xy,
num_boost_round=1,
)
config_0 = json.loads(booster_0.save_config())
np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize(
"tree_method,weighted", [
("approx", False), ("hist", False), ("approx", True), ("hist", True)
]
)
def test_adaptive(self, tree_method, weighted) -> None:
self.run_adaptive(tree_method, weighted)