[breaking] Remove deprecated parameters in the skl interface. (#9986)

This commit is contained in:
Jiaming Yuan
2024-01-15 20:40:05 +08:00
committed by GitHub
parent 2de85d3241
commit 0798e36d73
16 changed files with 418 additions and 462 deletions

View File

@@ -9,37 +9,41 @@ rng = np.random.RandomState(1337)
class TestEvalMetrics:
xgb_params_01 = {'nthread': 1, 'eval_metric': 'error'}
xgb_params_01 = {"nthread": 1, "eval_metric": "error"}
xgb_params_02 = {'nthread': 1, 'eval_metric': ['error']}
xgb_params_02 = {"nthread": 1, "eval_metric": ["error"]}
xgb_params_03 = {'nthread': 1, 'eval_metric': ['rmse', 'error']}
xgb_params_03 = {"nthread": 1, "eval_metric": ["rmse", "error"]}
xgb_params_04 = {'nthread': 1, 'eval_metric': ['error', 'rmse']}
xgb_params_04 = {"nthread": 1, "eval_metric": ["error", "rmse"]}
def evalerror_01(self, preds, dtrain):
labels = dtrain.get_label()
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
return "error", float(sum(labels != (preds > 0.0))) / len(labels)
def evalerror_02(self, preds, dtrain):
labels = dtrain.get_label()
return [('error', float(sum(labels != (preds > 0.0))) / len(labels))]
return [("error", float(sum(labels != (preds > 0.0))) / len(labels))]
@pytest.mark.skipif(**tm.no_sklearn())
def evalerror_03(self, preds, dtrain):
from sklearn.metrics import mean_squared_error
labels = dtrain.get_label()
return [('rmse', mean_squared_error(labels, preds)),
('error', float(sum(labels != (preds > 0.0))) / len(labels))]
return [
("rmse", mean_squared_error(labels, preds)),
("error", float(sum(labels != (preds > 0.0))) / len(labels)),
]
@pytest.mark.skipif(**tm.no_sklearn())
def evalerror_04(self, preds, dtrain):
from sklearn.metrics import mean_squared_error
labels = dtrain.get_label()
return [('error', float(sum(labels != (preds > 0.0))) / len(labels)),
('rmse', mean_squared_error(labels, preds))]
return [
("error", float(sum(labels != (preds > 0.0))) / len(labels)),
("rmse", mean_squared_error(labels, preds)),
]
@pytest.mark.skipif(**tm.no_sklearn())
def test_eval_metrics(self):
@@ -50,15 +54,15 @@ class TestEvalMetrics:
from sklearn.datasets import load_digits
digits = load_digits(n_class=2)
X = digits['data']
y = digits['target']
X = digits["data"]
y = digits["target"]
Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)
dtrain = xgb.DMatrix(Xt, label=yt)
dvalid = xgb.DMatrix(Xv, label=yv)
watchlist = [(dtrain, 'train'), (dvalid, 'val')]
watchlist = [(dtrain, "train"), (dvalid, "val")]
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, num_boost_round=10)
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, num_boost_round=10)
@@ -66,26 +70,54 @@ class TestEvalMetrics:
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
early_stopping_rounds=2)
gbdt_01 = xgb.train(
self.xgb_params_01, dtrain, 10, watchlist, early_stopping_rounds=2
)
gbdt_02 = xgb.train(
self.xgb_params_02, dtrain, 10, watchlist, early_stopping_rounds=2
)
gbdt_03 = xgb.train(
self.xgb_params_03, dtrain, 10, watchlist, early_stopping_rounds=2
)
gbdt_04 = xgb.train(
self.xgb_params_04, dtrain, 10, watchlist, early_stopping_rounds=2
)
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_01)
gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_02)
gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_03)
gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
early_stopping_rounds=2, feval=self.evalerror_04)
gbdt_01 = xgb.train(
self.xgb_params_01,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_01,
)
gbdt_02 = xgb.train(
self.xgb_params_02,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_02,
)
gbdt_03 = xgb.train(
self.xgb_params_03,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_03,
)
gbdt_04 = xgb.train(
self.xgb_params_04,
dtrain,
10,
watchlist,
early_stopping_rounds=2,
feval=self.evalerror_04,
)
assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
@@ -93,6 +125,7 @@ class TestEvalMetrics:
@pytest.mark.skipif(**tm.no_sklearn())
def test_gamma_deviance(self):
from sklearn.metrics import mean_gamma_deviance
rng = np.random.RandomState(1994)
n_samples = 100
n_features = 30
@@ -101,8 +134,13 @@ class TestEvalMetrics:
y = rng.randn(n_samples)
y = y - y.min() * 100
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=10)
reg.fit(X, y, eval_metric="gamma-deviance")
reg = xgb.XGBRegressor(
tree_method="hist",
objective="reg:gamma",
n_estimators=10,
eval_metric="gamma-deviance",
)
reg.fit(X, y)
booster = reg.get_booster()
score = reg.predict(X)
@@ -113,16 +151,26 @@ class TestEvalMetrics:
@pytest.mark.skipif(**tm.no_sklearn())
def test_gamma_lik(self) -> None:
import scipy.stats as stats
rng = np.random.default_rng(1994)
n_samples = 32
n_features = 10
X = rng.normal(0, 1, size=n_samples * n_features).reshape((n_samples, n_features))
X = rng.normal(0, 1, size=n_samples * n_features).reshape(
(n_samples, n_features)
)
alpha, loc, beta = 5.0, 11.1, 22
y = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=n_samples, random_state=rng)
reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=64)
reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])
y = stats.gamma.rvs(
alpha, loc=loc, scale=beta, size=n_samples, random_state=rng
)
reg = xgb.XGBRegressor(
tree_method="hist",
objective="reg:gamma",
n_estimators=64,
eval_metric="gamma-nloglik",
)
reg.fit(X, y, eval_set=[(X, y)])
score = reg.predict(X)
@@ -134,7 +182,7 @@ class TestEvalMetrics:
# XGBoost uses the canonical link function of gamma in evaluation function.
# so \theta = - (1.0 / y)
# dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
beta = - (1.0 / (- (1.0 / y))) # == y
beta = -(1.0 / (-(1.0 / y))) # == y
nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)
np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
@@ -153,7 +201,7 @@ class TestEvalMetrics:
n_features,
n_informative=n_features,
n_redundant=0,
random_state=rng
random_state=rng,
)
Xy = xgb.DMatrix(X, y)
booster = xgb.train(
@@ -197,7 +245,7 @@ class TestEvalMetrics:
n_informative=n_features,
n_redundant=0,
n_classes=n_classes,
random_state=rng
random_state=rng,
)
if weighted:
weights = rng.randn(n_samples)
@@ -242,20 +290,25 @@ class TestEvalMetrics:
def run_pr_auc_binary(self, tree_method):
from sklearn.datasets import make_classification
from sklearn.metrics import auc, precision_recall_curve
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
clf = xgb.XGBClassifier(
tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
)
clf.fit(X, y, eval_set=[(X, y)])
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
y_score = clf.predict_proba(X)[:, 1] # get the positive column
precision, recall, _ = precision_recall_curve(y, y_score)
prauc = auc(recall, precision)
# Interpolation results are slightly different from sklearn, but overall should be
# similar.
# Interpolation results are slightly different from sklearn, but overall should
# be similar.
np.testing.assert_allclose(prauc, evals_result, rtol=1e-2)
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=10)
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
clf = xgb.XGBClassifier(
tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
)
clf.fit(X, y, eval_set=[(X, y)])
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
np.testing.assert_allclose(0.99, evals_result, rtol=1e-2)
@@ -264,16 +317,21 @@ class TestEvalMetrics:
def run_pr_auc_multi(self, tree_method):
from sklearn.datasets import make_classification
X, y = make_classification(
64, 16, n_informative=8, n_classes=3, random_state=1994
)
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
clf = xgb.XGBClassifier(
tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
)
clf.fit(X, y, eval_set=[(X, y)])
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
# No available implementation for comparison, just check that XGBoost converges to
# 1.0
clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=10)
clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
# No available implementation for comparison, just check that XGBoost converges
# to 1.0
clf = xgb.XGBClassifier(
tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
)
clf.fit(X, y, eval_set=[(X, y)])
evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
np.testing.assert_allclose(1.0, evals_result, rtol=1e-2)
@@ -282,9 +340,13 @@ class TestEvalMetrics:
def run_pr_auc_ltr(self, tree_method):
from sklearn.datasets import make_classification
X, y = make_classification(128, 4, n_classes=2, random_state=1994)
ltr = xgb.XGBRanker(
tree_method=tree_method, n_estimators=16, objective="rank:pairwise"
tree_method=tree_method,
n_estimators=16,
objective="rank:pairwise",
eval_metric="aucpr",
)
groups = np.array([32, 32, 64])
ltr.fit(
@@ -293,7 +355,6 @@ class TestEvalMetrics:
group=groups,
eval_set=[(X, y)],
eval_group=[groups],
eval_metric="aucpr",
)
results = ltr.evals_result()["validation_0"]["aucpr"]
assert results[-1] >= 0.99