Remove `ntree_limit` from the Python package. (#8345)
- Remove `ntree_limit`. The parameter has been deprecated since 1.4.0.
- Compatibility with the SHAP package is broken by this removal.
This commit is contained in:
@@ -64,7 +64,7 @@ class TestModels:
|
||||
num_round = 2
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
# this is prediction
|
||||
preds = bst.predict(dtest, ntree_limit=num_round)
|
||||
preds = bst.predict(dtest, iteration_range=(0, num_round))
|
||||
labels = dtest.get_label()
|
||||
err = sum(1 for i in range(len(preds))
|
||||
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||
@@ -83,7 +83,7 @@ class TestModels:
|
||||
bst2 = xgb.Booster(params=param, model_file=model_path)
|
||||
dtest2 = xgb.DMatrix(dtest_path)
|
||||
|
||||
preds2 = bst2.predict(dtest2, ntree_limit=num_round)
|
||||
preds2 = bst2.predict(dtest2, iteration_range=(0, num_round))
|
||||
|
||||
# assert they are the same
|
||||
assert np.sum(np.abs(preds2 - preds)) == 0
|
||||
@@ -96,7 +96,7 @@ class TestModels:
|
||||
# check whether custom evaluation metrics work
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist,
|
||||
feval=my_logloss)
|
||||
preds3 = bst.predict(dtest, ntree_limit=num_round)
|
||||
preds3 = bst.predict(dtest, iteration_range=(0, num_round))
|
||||
assert all(preds3 == preds)
|
||||
|
||||
# check whether sample_type and normalize_type work
|
||||
@@ -110,7 +110,7 @@ class TestModels:
|
||||
param['sample_type'] = p[0]
|
||||
param['normalize_type'] = p[1]
|
||||
bst = xgb.train(param, dtrain, num_round, watchlist)
|
||||
preds = bst.predict(dtest, ntree_limit=num_round)
|
||||
preds = bst.predict(dtest, iteration_range=(0, num_round))
|
||||
err = sum(1 for i in range(len(preds))
|
||||
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
|
||||
assert err < 0.1
|
||||
@@ -472,8 +472,8 @@ class TestModels:
|
||||
X, y = load_iris(return_X_y=True)
|
||||
cls = xgb.XGBClassifier(n_estimators=2)
|
||||
cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
|
||||
assert cls.get_booster().best_ntree_limit == 2
|
||||
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
|
||||
assert cls.get_booster().best_iteration == cls.n_estimators - 1
|
||||
assert cls.best_iteration == cls.get_booster().best_iteration
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
path = os.path.join(tmpdir, "cls.json")
|
||||
@@ -481,8 +481,8 @@ class TestModels:
|
||||
|
||||
cls = xgb.XGBClassifier(n_estimators=2)
|
||||
cls.load_model(path)
|
||||
assert cls.get_booster().best_ntree_limit == 2
|
||||
assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
|
||||
assert cls.get_booster().best_iteration == cls.n_estimators - 1
|
||||
assert cls.best_iteration == cls.get_booster().best_iteration
|
||||
|
||||
def run_slice(
|
||||
self,
|
||||
|
||||
@@ -102,7 +102,6 @@ eval[test] = {data_path}
|
||||
booster.feature_names = None
|
||||
booster.feature_types = None
|
||||
booster.set_attr(best_iteration=None)
|
||||
booster.set_attr(best_ntree_limit=None)
|
||||
|
||||
booster.save_model(model_out_py)
|
||||
py_predt = booster.predict(data)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
'''Tests for running inplace prediction.'''
|
||||
"""Tests for running inplace prediction."""
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import numpy as np
|
||||
@@ -17,10 +17,10 @@ def run_threaded_predict(X, rows, predict_func):
|
||||
per_thread = 20
|
||||
with ThreadPoolExecutor(max_workers=10) as e:
|
||||
for i in range(0, rows, int(rows / per_thread)):
|
||||
if hasattr(X, 'iloc'):
|
||||
predictor = X.iloc[i:i+per_thread, :]
|
||||
if hasattr(X, "iloc"):
|
||||
predictor = X.iloc[i : i + per_thread, :]
|
||||
else:
|
||||
predictor = X[i:i+per_thread, ...]
|
||||
predictor = X[i : i + per_thread, ...]
|
||||
f = e.submit(predict_func, predictor)
|
||||
results.append(f)
|
||||
|
||||
@@ -61,27 +61,31 @@ def run_predict_leaf(predictor):
|
||||
|
||||
validate_leaf_output(leaf, num_parallel_tree)
|
||||
|
||||
ntree_limit = 2
|
||||
n_iters = 2
|
||||
sliced = booster.predict(
|
||||
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True
|
||||
m,
|
||||
pred_leaf=True,
|
||||
iteration_range=(0, n_iters),
|
||||
strict_shape=True,
|
||||
)
|
||||
first = sliced[0, ...]
|
||||
|
||||
assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit
|
||||
assert np.prod(first.shape) == classes * num_parallel_tree * n_iters
|
||||
|
||||
# When there's only 1 tree, the output is a 1 dim vector
|
||||
booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
|
||||
assert booster.predict(m, pred_leaf=True).shape == (rows, )
|
||||
assert booster.predict(m, pred_leaf=True).shape == (rows,)
|
||||
|
||||
return leaf
|
||||
|
||||
|
||||
def test_predict_leaf():
|
||||
run_predict_leaf('cpu_predictor')
|
||||
run_predict_leaf("cpu_predictor")
|
||||
|
||||
|
||||
def test_predict_shape():
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
|
||||
X, y = fetch_california_housing(return_X_y=True)
|
||||
reg = xgb.XGBRegressor(n_estimators=1)
|
||||
reg.fit(X, y)
|
||||
@@ -119,13 +123,14 @@ def test_predict_shape():
|
||||
|
||||
|
||||
class TestInplacePredict:
|
||||
'''Tests for running inplace prediction'''
|
||||
"""Tests for running inplace prediction"""
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
cls.rows = 1000
|
||||
cls.cols = 10
|
||||
|
||||
cls.missing = 11 # set to integer for testing
|
||||
cls.missing = 11 # set to integer for testing
|
||||
|
||||
cls.rng = np.random.RandomState(1994)
|
||||
|
||||
@@ -139,7 +144,7 @@ class TestInplacePredict:
|
||||
cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)
|
||||
|
||||
cls.num_boost_round = 10
|
||||
cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)
|
||||
cls.booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=10)
|
||||
|
||||
def test_predict(self):
|
||||
booster = self.booster
|
||||
@@ -162,28 +167,22 @@ class TestInplacePredict:
|
||||
predt_from_array = booster.inplace_predict(
|
||||
X[:10, ...], iteration_range=(0, 4), missing=self.missing
|
||||
)
|
||||
predt_from_dmatrix = booster.predict(test, ntree_limit=4)
|
||||
predt_from_dmatrix = booster.predict(test, iteration_range=(0, 4))
|
||||
|
||||
np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
|
||||
with pytest.raises(ValueError):
|
||||
booster.predict(test, iteration_range=(0, booster.best_iteration + 2))
|
||||
|
||||
default = booster.predict(test)
|
||||
|
||||
range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
|
||||
ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
|
||||
np.testing.assert_allclose(range_full, default)
|
||||
np.testing.assert_allclose(ntree_full, default)
|
||||
|
||||
range_full = booster.predict(
|
||||
test, iteration_range=(0, booster.best_iteration + 1)
|
||||
)
|
||||
ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
|
||||
np.testing.assert_allclose(range_full, default)
|
||||
np.testing.assert_allclose(ntree_full, default)
|
||||
|
||||
def predict_dense(x):
|
||||
inplace_predt = booster.inplace_predict(x)
|
||||
@@ -251,6 +250,7 @@ class TestInplacePredict:
|
||||
@pytest.mark.skipif(**tm.no_pandas())
|
||||
def test_pd_dtypes(self) -> None:
|
||||
from pandas.api.types import is_bool_dtype
|
||||
|
||||
for orig, x in pd_dtypes():
|
||||
dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes]
|
||||
if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]):
|
||||
|
||||
@@ -60,7 +60,7 @@ def test_ranking_with_weighted_data():
|
||||
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
|
||||
|
||||
for i in range(1, 11):
|
||||
pred = bst.predict(dtrain, ntree_limit=i)
|
||||
pred = bst.predict(dtrain, iteration_range=(0, i))
|
||||
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
|
||||
is_sorted = []
|
||||
for k in range(0, 20, 5):
|
||||
|
||||
@@ -95,44 +95,39 @@ class TestTrainingContinuation:
|
||||
res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
|
||||
assert res1 == res2
|
||||
|
||||
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
|
||||
num_boost_round=3)
|
||||
assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration +
|
||||
1) * self.num_parallel_tree
|
||||
|
||||
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=3)
|
||||
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
|
||||
res2 = mean_squared_error(y_2class,
|
||||
gbdt_04.predict(
|
||||
dtrain_2class,
|
||||
ntree_limit=gbdt_04.best_ntree_limit))
|
||||
res2 = mean_squared_error(
|
||||
y_2class,
|
||||
gbdt_04.predict(
|
||||
dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
|
||||
)
|
||||
)
|
||||
assert res1 == res2
|
||||
|
||||
gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
|
||||
num_boost_round=7, xgb_model=gbdt_04)
|
||||
assert gbdt_04.best_ntree_limit == (
|
||||
gbdt_04.best_iteration + 1) * self.num_parallel_tree
|
||||
|
||||
gbdt_04 = xgb.train(
|
||||
xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04
|
||||
)
|
||||
res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
|
||||
res2 = mean_squared_error(y_2class,
|
||||
gbdt_04.predict(
|
||||
dtrain_2class,
|
||||
ntree_limit=gbdt_04.best_ntree_limit))
|
||||
res2 = mean_squared_error(
|
||||
y_2class,
|
||||
gbdt_04.predict(
|
||||
dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
|
||||
)
|
||||
)
|
||||
assert res1 == res2
|
||||
|
||||
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
|
||||
num_boost_round=7)
|
||||
assert gbdt_05.best_ntree_limit == (
|
||||
gbdt_05.best_iteration + 1) * self.num_parallel_tree
|
||||
gbdt_05 = xgb.train(xgb_params_03,
|
||||
dtrain_5class,
|
||||
num_boost_round=3,
|
||||
xgb_model=gbdt_05)
|
||||
assert gbdt_05.best_ntree_limit == (
|
||||
gbdt_05.best_iteration + 1) * self.num_parallel_tree
|
||||
|
||||
res1 = gbdt_05.predict(dtrain_5class)
|
||||
res2 = gbdt_05.predict(dtrain_5class,
|
||||
ntree_limit=gbdt_05.best_ntree_limit)
|
||||
res2 = gbdt_05.predict(
|
||||
dtrain_5class, iteration_range=(0, gbdt_05.best_iteration + 1)
|
||||
)
|
||||
np.testing.assert_almost_equal(res1, res2)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
|
||||
@@ -13,9 +13,9 @@ except Exception:
|
||||
pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
|
||||
|
||||
|
||||
# Check integration is not broken from xgboost side
|
||||
# Changes in binary format may cause problems
|
||||
def test_with_shap():
|
||||
# xgboost removed ntree_limit in 2.0, which breaks the SHAP package.
|
||||
@pytest.mark.xfail
|
||||
def test_with_shap() -> None:
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
|
||||
X, y = fetch_california_housing(return_X_y=True)
|
||||
|
||||
@@ -63,9 +63,15 @@ def test_multiclass_classification(objective):
|
||||
assert xgb_model.get_booster().num_boosted_rounds() == 100
|
||||
preds = xgb_model.predict(X[test_index])
|
||||
# test other params in XGBClassifier().fit
|
||||
preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
|
||||
preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
|
||||
preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
|
||||
preds2 = xgb_model.predict(
|
||||
X[test_index], output_margin=True, iteration_range=(0, 1)
|
||||
)
|
||||
preds3 = xgb_model.predict(
|
||||
X[test_index], output_margin=True, iteration_range=None
|
||||
)
|
||||
preds4 = xgb_model.predict(
|
||||
X[test_index], output_margin=False, iteration_range=(0, 1)
|
||||
)
|
||||
labels = y[test_index]
|
||||
|
||||
check_pred(preds, labels, output_margin=False)
|
||||
@@ -86,25 +92,21 @@ def test_multiclass_classification(objective):
|
||||
assert proba.shape[1] == cls.n_classes_
|
||||
|
||||
|
||||
def test_best_ntree_limit():
|
||||
def test_best_iteration():
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
|
||||
def train(booster, forest):
|
||||
def train(booster: str, forest: Optional[int]) -> None:
|
||||
rounds = 4
|
||||
cls = xgb.XGBClassifier(
|
||||
n_estimators=rounds, num_parallel_tree=forest, booster=booster
|
||||
).fit(
|
||||
X, y, eval_set=[(X, y)], early_stopping_rounds=3
|
||||
)
|
||||
assert cls.best_iteration == rounds - 1
|
||||
|
||||
if forest:
|
||||
assert cls.best_ntree_limit == rounds * forest
|
||||
else:
|
||||
assert cls.best_ntree_limit == 0
|
||||
|
||||
# best_ntree_limit is used by default, assert that under gblinear it's
|
||||
# best_iteration is used by default, assert that under gblinear it's
|
||||
# automatically ignored due to being 0.
|
||||
cls.predict(X)
|
||||
|
||||
@@ -430,12 +432,15 @@ def test_regression():
|
||||
|
||||
preds = xgb_model.predict(X[test_index])
|
||||
# test other params in XGBRegressor().fit
|
||||
preds2 = xgb_model.predict(X[test_index], output_margin=True,
|
||||
ntree_limit=3)
|
||||
preds3 = xgb_model.predict(X[test_index], output_margin=True,
|
||||
ntree_limit=0)
|
||||
preds4 = xgb_model.predict(X[test_index], output_margin=False,
|
||||
ntree_limit=3)
|
||||
preds2 = xgb_model.predict(
|
||||
X[test_index], output_margin=True, iteration_range=(0, 3)
|
||||
)
|
||||
preds3 = xgb_model.predict(
|
||||
X[test_index], output_margin=True, iteration_range=None
|
||||
)
|
||||
preds4 = xgb_model.predict(
|
||||
X[test_index], output_margin=False, iteration_range=(0, 3)
|
||||
)
|
||||
labels = y[test_index]
|
||||
|
||||
assert mean_squared_error(preds, labels) < 25
|
||||
|
||||
Reference in New Issue
Block a user