Remove ntree limit in python package. (#8345)

- Remove `ntree_limit`. The parameter has been deprecated since 1.4.0.
- Compatibility with the SHAP package is broken by this change.
2023-03-31 19:01:55 +08:00
parent b647403baa
commit bac22734fb
17 changed files with 284 additions and 357 deletions
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -64,7 +64,7 @@ class TestModels:
        num_round = 2
        bst = xgb.train(param, dtrain, num_round, watchlist)
        # this is prediction
-        preds = bst.predict(dtest, ntree_limit=num_round)
+        preds = bst.predict(dtest, iteration_range=(0, num_round))
        labels = dtest.get_label()
        err = sum(1 for i in range(len(preds))
                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
@@ -83,7 +83,7 @@ class TestModels:
            bst2 = xgb.Booster(params=param, model_file=model_path)
            dtest2 = xgb.DMatrix(dtest_path)

-        preds2 = bst2.predict(dtest2, ntree_limit=num_round)
+        preds2 = bst2.predict(dtest2, iteration_range=(0, num_round))

        # assert they are the same
        assert np.sum(np.abs(preds2 - preds)) == 0
@@ -96,7 +96,7 @@ class TestModels:
        # check whether custom evaluation metrics work
        bst = xgb.train(param, dtrain, num_round, watchlist,
                        feval=my_logloss)
-        preds3 = bst.predict(dtest, ntree_limit=num_round)
+        preds3 = bst.predict(dtest, iteration_range=(0, num_round))
        assert all(preds3 == preds)

        # check whether sample_type and normalize_type work
@@ -110,7 +110,7 @@ class TestModels:
            param['sample_type'] = p[0]
            param['normalize_type'] = p[1]
            bst = xgb.train(param, dtrain, num_round, watchlist)
-            preds = bst.predict(dtest, ntree_limit=num_round)
+            preds = bst.predict(dtest, iteration_range=(0, num_round))
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1
@@ -472,8 +472,8 @@ class TestModels:
        X, y = load_iris(return_X_y=True)
        cls = xgb.XGBClassifier(n_estimators=2)
        cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
-        assert cls.get_booster().best_ntree_limit == 2
-        assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
+        assert cls.get_booster().best_iteration == cls.n_estimators - 1
+        assert cls.best_iteration == cls.get_booster().best_iteration

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "cls.json")
@@ -481,8 +481,8 @@ class TestModels:

            cls = xgb.XGBClassifier(n_estimators=2)
            cls.load_model(path)
-            assert cls.get_booster().best_ntree_limit == 2
-            assert cls.best_ntree_limit == cls.get_booster().best_ntree_limit
+            assert cls.get_booster().best_iteration == cls.n_estimators - 1
+            assert cls.best_iteration == cls.get_booster().best_iteration

    def run_slice(
        self,
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@@ -102,7 +102,6 @@ eval[test] = {data_path}
            booster.feature_names = None
            booster.feature_types = None
            booster.set_attr(best_iteration=None)
-            booster.set_attr(best_ntree_limit=None)

            booster.save_model(model_out_py)
            py_predt = booster.predict(data)
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -1,4 +1,4 @@
-'''Tests for running inplace prediction.'''
+"""Tests for running inplace prediction."""
 from concurrent.futures import ThreadPoolExecutor

 import numpy as np
@@ -17,10 +17,10 @@ def run_threaded_predict(X, rows, predict_func):
    per_thread = 20
    with ThreadPoolExecutor(max_workers=10) as e:
        for i in range(0, rows, int(rows / per_thread)):
-            if hasattr(X, 'iloc'):
-                predictor = X.iloc[i:i+per_thread, :]
+            if hasattr(X, "iloc"):
+                predictor = X.iloc[i : i + per_thread, :]
            else:
-                predictor = X[i:i+per_thread, ...]
+                predictor = X[i : i + per_thread, ...]
            f = e.submit(predict_func, predictor)
            results.append(f)

@@ -61,27 +61,31 @@ def run_predict_leaf(predictor):

    validate_leaf_output(leaf, num_parallel_tree)

-    ntree_limit = 2
+    n_iters = 2
    sliced = booster.predict(
-        m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True
+        m,
+        pred_leaf=True,
+        iteration_range=(0, n_iters),
+        strict_shape=True,
    )
    first = sliced[0, ...]

-    assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit
+    assert np.prod(first.shape) == classes * num_parallel_tree * n_iters

    # When there's only 1 tree, the output is a 1 dim vector
    booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
-    assert booster.predict(m, pred_leaf=True).shape == (rows, )
+    assert booster.predict(m, pred_leaf=True).shape == (rows,)

    return leaf


 def test_predict_leaf():
-    run_predict_leaf('cpu_predictor')
+    run_predict_leaf("cpu_predictor")


 def test_predict_shape():
    from sklearn.datasets import fetch_california_housing
+
    X, y = fetch_california_housing(return_X_y=True)
    reg = xgb.XGBRegressor(n_estimators=1)
    reg.fit(X, y)
@@ -119,13 +123,14 @@ def test_predict_shape():


 class TestInplacePredict:
-    '''Tests for running inplace prediction'''
+    """Tests for running inplace prediction"""
+
    @classmethod
    def setup_class(cls):
        cls.rows = 1000
        cls.cols = 10

-        cls.missing = 11            # set to integer for testing
+        cls.missing = 11  # set to integer for testing

        cls.rng = np.random.RandomState(1994)

@@ -139,7 +144,7 @@ class TestInplacePredict:
        cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)

        cls.num_boost_round = 10
-        cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)
+        cls.booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=10)

    def test_predict(self):
        booster = self.booster
@@ -162,28 +167,22 @@ class TestInplacePredict:
        predt_from_array = booster.inplace_predict(
            X[:10, ...], iteration_range=(0, 4), missing=self.missing
        )
-        predt_from_dmatrix = booster.predict(test, ntree_limit=4)
+        predt_from_dmatrix = booster.predict(test, iteration_range=(0, 4))

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

-        with pytest.raises(ValueError):
-            booster.predict(test, ntree_limit=booster.best_ntree_limit + 1)
        with pytest.raises(ValueError):
            booster.predict(test, iteration_range=(0, booster.best_iteration + 2))

        default = booster.predict(test)

        range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
-        ntree_full = booster.predict(test, ntree_limit=self.num_boost_round)
        np.testing.assert_allclose(range_full, default)
-        np.testing.assert_allclose(ntree_full, default)

        range_full = booster.predict(
            test, iteration_range=(0, booster.best_iteration + 1)
        )
-        ntree_full = booster.predict(test, ntree_limit=booster.best_ntree_limit)
        np.testing.assert_allclose(range_full, default)
-        np.testing.assert_allclose(ntree_full, default)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
@@ -251,6 +250,7 @@ class TestInplacePredict:
    @pytest.mark.skipif(**tm.no_pandas())
    def test_pd_dtypes(self) -> None:
        from pandas.api.types import is_bool_dtype
+
        for orig, x in pd_dtypes():
            dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes]
            if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]):
--- a/tests/python/test_ranking.py
+++ b/tests/python/test_ranking.py
@@ -60,7 +60,7 @@ def test_ranking_with_weighted_data():
    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))

    for i in range(1, 11):
-        pred = bst.predict(dtrain, ntree_limit=i)
+        pred = bst.predict(dtrain, iteration_range=(0, i))
        # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
        is_sorted = []
        for k in range(0, 20, 5):
--- a/tests/python/test_training_continuation.py
+++ b/tests/python/test_training_continuation.py
@@ -95,44 +95,39 @@ class TestTrainingContinuation:
        res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
        assert res1 == res2

-        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
-                            num_boost_round=3)
-        assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration +
-                                            1) * self.num_parallel_tree
-
+        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=3)
        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
-        res2 = mean_squared_error(y_2class,
-                                  gbdt_04.predict(
-                                      dtrain_2class,
-                                      ntree_limit=gbdt_04.best_ntree_limit))
+        res2 = mean_squared_error(
+            y_2class,
+            gbdt_04.predict(
+                dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
+            )
+        )
        assert res1 == res2

-        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
-                            num_boost_round=7, xgb_model=gbdt_04)
-        assert gbdt_04.best_ntree_limit == (
-            gbdt_04.best_iteration + 1) * self.num_parallel_tree
-
+        gbdt_04 = xgb.train(
+            xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04
+        )
        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
-        res2 = mean_squared_error(y_2class,
-                                  gbdt_04.predict(
-                                      dtrain_2class,
-                                      ntree_limit=gbdt_04.best_ntree_limit))
+        res2 = mean_squared_error(
+            y_2class,
+            gbdt_04.predict(
+                dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1)
+            )
+        )
        assert res1 == res2

        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
                            num_boost_round=7)
-        assert gbdt_05.best_ntree_limit == (
-            gbdt_05.best_iteration + 1) * self.num_parallel_tree
        gbdt_05 = xgb.train(xgb_params_03,
                            dtrain_5class,
                            num_boost_round=3,
                            xgb_model=gbdt_05)
-        assert gbdt_05.best_ntree_limit == (
-            gbdt_05.best_iteration + 1) * self.num_parallel_tree

        res1 = gbdt_05.predict(dtrain_5class)
-        res2 = gbdt_05.predict(dtrain_5class,
-                               ntree_limit=gbdt_05.best_ntree_limit)
+        res2 = gbdt_05.predict(
+            dtrain_5class, iteration_range=(0, gbdt_05.best_iteration + 1)
+        )
        np.testing.assert_almost_equal(res1, res2)

    @pytest.mark.skipif(**tm.no_sklearn())
--- a/tests/python/test_with_shap.py
+++ b/tests/python/test_with_shap.py
@@ -13,9 +13,9 @@ except Exception:
 pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")


-# Check integration is not broken from xgboost side
-# Changes in binary format may cause problems
-def test_with_shap():
+# xgboost removed ntree_limit in 2.0, which breaks the SHAP package.
+@pytest.mark.xfail
+def test_with_shap() -> None:
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -63,9 +63,15 @@ def test_multiclass_classification(objective):
        assert xgb_model.get_booster().num_boosted_rounds() == 100
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
-        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
-        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
-        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
+        preds2 = xgb_model.predict(
+            X[test_index], output_margin=True, iteration_range=(0, 1)
+        )
+        preds3 = xgb_model.predict(
+            X[test_index], output_margin=True, iteration_range=None
+        )
+        preds4 = xgb_model.predict(
+            X[test_index], output_margin=False, iteration_range=(0, 1)
+        )
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
@@ -86,25 +92,21 @@ def test_multiclass_classification(objective):
    assert proba.shape[1] == cls.n_classes_


-def test_best_ntree_limit():
+def test_best_iteration():
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)

-    def train(booster, forest):
+    def train(booster: str, forest: Optional[int]) -> None:
        rounds = 4
        cls = xgb.XGBClassifier(
            n_estimators=rounds, num_parallel_tree=forest, booster=booster
        ).fit(
            X, y, eval_set=[(X, y)], early_stopping_rounds=3
        )
+        assert cls.best_iteration == rounds - 1

-        if forest:
-            assert cls.best_ntree_limit == rounds * forest
-        else:
-            assert cls.best_ntree_limit == 0
-
-        # best_ntree_limit is used by default, assert that under gblinear it's
+        # best_iteration is used by default, assert that under gblinear it's
        # automatically ignored due to being 0.
        cls.predict(X)

@@ -430,12 +432,15 @@ def test_regression():

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
-        preds2 = xgb_model.predict(X[test_index], output_margin=True,
-                                   ntree_limit=3)
-        preds3 = xgb_model.predict(X[test_index], output_margin=True,
-                                   ntree_limit=0)
-        preds4 = xgb_model.predict(X[test_index], output_margin=False,
-                                   ntree_limit=3)
+        preds2 = xgb_model.predict(
+            X[test_index], output_margin=True, iteration_range=(0, 3)
+        )
+        preds3 = xgb_model.predict(
+            X[test_index], output_margin=True, iteration_range=None
+        )
+        preds4 = xgb_model.predict(
+            X[test_index], output_margin=False, iteration_range=(0, 3)
+        )
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25