Init estimation for regression. (#8272)

This commit is contained in:
Jiaming Yuan
2023-01-11 02:04:56 +08:00
committed by GitHub
parent 1b58d81315
commit badeff1d74
29 changed files with 466 additions and 132 deletions

View File

@@ -1,5 +1,6 @@
import numpy as np
import pytest
from xgboost.testing.updater import get_basescore
import xgboost as xgb
from xgboost import testing as tm
@@ -11,16 +12,12 @@ class TestEarlyStopping:
@pytest.mark.skipif(**tm.no_sklearn())
def test_early_stopping_nonparallel(self):
from sklearn.datasets import load_digits
try:
from sklearn.model_selection import train_test_split
except ImportError:
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
digits = load_digits(n_class=2)
X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y,
random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf1 = xgb.XGBClassifier(learning_rate=0.1)
clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
eval_set=[(X_test, y_test)])
@@ -31,9 +28,23 @@ class TestEarlyStopping:
assert clf1.best_score == clf2.best_score
assert clf1.best_score != 1
# check overfit
clf3 = xgb.XGBClassifier(learning_rate=0.1)
clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_test, y_test)])
clf3 = xgb.XGBClassifier(
learning_rate=0.1,
eval_metric="auc",
early_stopping_rounds=10
)
clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
base_score = get_basescore(clf3)
assert 0.53 > base_score > 0.5
clf3 = xgb.XGBClassifier(
learning_rate=0.1,
base_score=.5,
eval_metric="auc",
early_stopping_rounds=10
)
clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
assert clf3.best_score == 1
def evalerror(self, preds, dtrain):

View File

@@ -9,11 +9,13 @@ train_data = xgb.DMatrix(np.array([[1]]), label=np.array([1]))
class TestTreeRegularization:
def test_alpha(self):
params = {
'tree_method': 'exact', 'verbosity': 0,
'objective': 'reg:squarederror',
'eta': 1,
'lambda': 0,
'alpha': 0.1
"tree_method": "exact",
"verbosity": 0,
"objective": "reg:squarederror",
"eta": 1,
"lambda": 0,
"alpha": 0.1,
"base_score": 0.5,
}
model = xgb.train(params, train_data, 1)
@@ -27,11 +29,13 @@ class TestTreeRegularization:
def test_lambda(self):
params = {
'tree_method': 'exact', 'verbosity': 0,
'objective': 'reg:squarederror',
'eta': 1,
'lambda': 1,
'alpha': 0
"tree_method": "exact",
"verbosity": 0,
"objective": "reg:squarederror",
"eta": 1,
"lambda": 1,
"alpha": 0,
"base_score": 0.5,
}
model = xgb.train(params, train_data, 1)
@@ -45,11 +49,13 @@ class TestTreeRegularization:
def test_alpha_and_lambda(self):
params = {
'tree_method': 'exact', 'verbosity': 1,
'objective': 'reg:squarederror',
'eta': 1,
'lambda': 1,
'alpha': 0.1
"tree_method": "exact",
"verbosity": 1,
"objective": "reg:squarederror",
"eta": 1,
"lambda": 1,
"alpha": 0.1,
"base_score": 0.5,
}
model = xgb.train(params, train_data, 1)

View File

@@ -10,6 +10,7 @@ from xgboost.testing.params import (
exact_parameter_strategy,
hist_parameter_strategy,
)
from xgboost.testing.updater import check_init_estimation
import xgboost as xgb
from xgboost import testing as tm
@@ -449,3 +450,6 @@ class TestTreeMethod:
)
def test_adaptive(self, tree_method, weighted) -> None:
self.run_adaptive(tree_method, weighted)
def test_init_estimation(self) -> None:
check_init_estimation("hist")

View File

@@ -9,6 +9,7 @@ except Exception:
shap = None
pass
pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
@@ -16,11 +17,16 @@ pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
# Changes in binary format may cause problems
def test_with_shap():
from sklearn.datasets import fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)
dtrain = xgb.DMatrix(X, label=y)
model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
margin = model.predict(dtrain, output_margin=True)
assert np.allclose(np.sum(shap_values, axis=len(shap_values.shape) - 1),
margin - explainer.expected_value, 1e-3, 1e-3)
assert np.allclose(
np.sum(shap_values, axis=len(shap_values.shape) - 1),
margin - explainer.expected_value,
1e-3,
1e-3,
)

View File

@@ -9,6 +9,7 @@ import numpy as np
import pytest
from sklearn.utils.estimator_checks import parametrize_with_checks
from xgboost.testing.shared import get_feature_weights, validate_data_initialization
from xgboost.testing.updater import get_basescore
import xgboost as xgb
from xgboost import testing as tm
@@ -196,19 +197,22 @@ def test_stacking_classification():
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)
@pytest.mark.skipif(**tm.no_pandas())
def test_feature_importances_weight():
from sklearn.datasets import load_digits
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
y = digits["target"]
X = digits["data"]
xgb_model = xgb.XGBClassifier(
random_state=0,
tree_method="exact",
learning_rate=0.1,
importance_type="weight",
base_score=0.5,
).fit(X, y)
xgb_model = xgb.XGBClassifier(random_state=0,
tree_method="exact",
learning_rate=0.1,
importance_type="weight").fit(X, y)
exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0.,
@@ -223,16 +227,22 @@ def test_feature_importances_weight():
import pandas as pd
y = pd.Series(digits['target'])
X = pd.DataFrame(digits['data'])
xgb_model = xgb.XGBClassifier(random_state=0,
tree_method="exact",
learning_rate=0.1,
importance_type="weight").fit(X, y)
xgb_model = xgb.XGBClassifier(
random_state=0,
tree_method="exact",
learning_rate=0.1,
base_score=.5,
importance_type="weight"
).fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
xgb_model = xgb.XGBClassifier(random_state=0,
tree_method="exact",
learning_rate=0.1,
importance_type="weight").fit(X, y)
xgb_model = xgb.XGBClassifier(
random_state=0,
tree_method="exact",
learning_rate=0.1,
importance_type="weight",
base_score=.5,
).fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
with pytest.raises(ValueError):
@@ -274,6 +284,7 @@ def test_feature_importances_gain():
random_state=0, tree_method="exact",
learning_rate=0.1,
importance_type="gain",
base_score=0.5,
).fit(X, y)
exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
@@ -296,6 +307,7 @@ def test_feature_importances_gain():
tree_method="exact",
learning_rate=0.1,
importance_type="gain",
base_score=0.5,
).fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
@@ -304,6 +316,7 @@ def test_feature_importances_gain():
tree_method="exact",
learning_rate=0.1,
importance_type="gain",
base_score=0.5,
).fit(X, y)
np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
@@ -593,18 +606,21 @@ def test_split_value_histograms():
digits_2class = load_digits(n_class=2)
X = digits_2class['data']
y = digits_2class['target']
X = digits_2class["data"]
y = digits_2class["target"]
dm = xgb.DMatrix(X, label=y)
params = {'max_depth': 6, 'eta': 0.01, 'verbosity': 0,
'objective': 'binary:logistic'}
params = {
"max_depth": 6,
"eta": 0.01,
"verbosity": 0,
"objective": "binary:logistic",
"base_score": 0.5,
}
gbdt = xgb.train(params, dm, num_boost_round=10)
assert gbdt.get_split_value_histogram("not_there",
as_pandas=True).shape[0] == 0
assert gbdt.get_split_value_histogram("not_there",
as_pandas=False).shape[0] == 0
assert gbdt.get_split_value_histogram("not_there", as_pandas=True).shape[0] == 0
assert gbdt.get_split_value_histogram("not_there", as_pandas=False).shape[0] == 0
assert gbdt.get_split_value_histogram("f28", bins=0).shape[0] == 1
assert gbdt.get_split_value_histogram("f28", bins=1).shape[0] == 1
assert gbdt.get_split_value_histogram("f28", bins=2).shape[0] == 2
@@ -748,11 +764,7 @@ def test_sklearn_get_default_params():
cls = xgb.XGBClassifier()
assert cls.get_params()["base_score"] is None
cls.fit(X[:4, ...], y[:4, ...])
base_score = float(
json.loads(cls.get_booster().save_config())["learner"]["learner_model_param"][
"base_score"
]
)
base_score = get_basescore(cls)
np.testing.assert_equal(base_score, 0.5)