Remove ntree limit in python package. (#8345)

- Remove `ntree_limit`. The parameter has been deprecated since 1.4.0.
- The SHAP package compatibility is broken.
2023-03-31 19:01:55 +08:00
parent b647403baa
commit bac22734fb
17 changed files with 284 additions and 357 deletions
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -23,13 +23,7 @@ from typing import (
 import numpy

 from . import collective
-from .core import (
-    Booster,
-    DMatrix,
-    XGBoostError,
-    _get_booster_layer_trees,
-    _parse_eval_str,
-)
+from .core import Booster, DMatrix, XGBoostError, _parse_eval_str

 __all__ = [
    "TrainingCallback",
@@ -177,22 +171,14 @@ class CallbackContainer:
                assert isinstance(model, Booster), msg

        if not self.is_cv:
-            num_parallel_tree, _ = _get_booster_layer_trees(model)
            if model.attr("best_score") is not None:
                model.best_score = float(cast(str, model.attr("best_score")))
                model.best_iteration = int(cast(str, model.attr("best_iteration")))
-                # num_class is handled internally
-                model.set_attr(
-                    best_ntree_limit=str((model.best_iteration + 1) * num_parallel_tree)
-                )
-                model.best_ntree_limit = int(cast(str, model.attr("best_ntree_limit")))
            else:
                # Due to compatibility with version older than 1.4, these attributes are
                # added to Python object even if early stopping is not used.
                model.best_iteration = model.num_boosted_rounds() - 1
                model.set_attr(best_iteration=str(model.best_iteration))
-                model.best_ntree_limit = (model.best_iteration + 1) * num_parallel_tree
-                model.set_attr(best_ntree_limit=str(model.best_ntree_limit))

        return model

--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -126,25 +126,6 @@ def _parse_eval_str(result: str) -> List[Tuple[str, float]]:
 IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int])


-def _convert_ntree_limit(
-    booster: "Booster", ntree_limit: Optional[int], iteration_range: IterRange
-) -> IterRange:
-    if ntree_limit is not None and ntree_limit != 0:
-        warnings.warn(
-            "ntree_limit is deprecated, use `iteration_range` or model "
-            "slicing instead.",
-            UserWarning,
-        )
-        if iteration_range is not None and iteration_range[1] != 0:
-            raise ValueError(
-                "Only one of `iteration_range` and `ntree_limit` can be non zero."
-            )
-        num_parallel_tree, _ = _get_booster_layer_trees(booster)
-        num_parallel_tree = max([num_parallel_tree, 1])
-        iteration_range = (0, ntree_limit // num_parallel_tree)
-    return iteration_range
-
-
 def _expect(expectations: Sequence[Type], got: Type) -> str:
    """Translate input error into string.

@@ -1508,41 +1489,6 @@ Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
 Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]


-def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
-    """Get number of trees added to booster per-iteration.  This function will be removed
-    once `best_ntree_limit` is dropped in favor of `best_iteration`.  Returns
-    `num_parallel_tree` and `num_groups`.
-
-    """
-    config = json.loads(model.save_config())
-    booster = config["learner"]["gradient_booster"]["name"]
-    if booster == "gblinear":
-        num_parallel_tree = 0
-    elif booster == "dart":
-        num_parallel_tree = int(
-            config["learner"]["gradient_booster"]["gbtree"]["gbtree_model_param"][
-                "num_parallel_tree"
-            ]
-        )
-    elif booster == "gbtree":
-        try:
-            num_parallel_tree = int(
-                config["learner"]["gradient_booster"]["gbtree_model_param"][
-                    "num_parallel_tree"
-                ]
-            )
-        except KeyError:
-            num_parallel_tree = int(
-                config["learner"]["gradient_booster"]["gbtree_train_param"][
-                    "num_parallel_tree"
-                ]
-            )
-    else:
-        raise ValueError(f"Unknown booster: {booster}")
-    num_groups = int(config["learner"]["learner_model_param"]["num_class"])
-    return num_parallel_tree, num_groups
-
-
 def _configure_metrics(params: BoosterParam) -> BoosterParam:
    if (
        isinstance(params, dict)
@@ -1576,11 +1522,11 @@ class Booster:
        """
        Parameters
        ----------
-        params : dict
+        params :
            Parameters for boosters.
-        cache : list
+        cache :
            List of cache items.
-        model_file : string/os.PathLike/Booster/bytearray
+        model_file :
            Path to the model file if it's string or PathLike.
        """
        cache = cache if cache is not None else []
@@ -2100,7 +2046,6 @@ class Booster:
        self,
        data: DMatrix,
        output_margin: bool = False,
-        ntree_limit: int = 0,
        pred_leaf: bool = False,
        pred_contribs: bool = False,
        approx_contribs: bool = False,
@@ -2127,9 +2072,6 @@ class Booster:
        output_margin :
            Whether to output the raw untransformed margin value.

-        ntree_limit :
-            Deprecated, use `iteration_range` instead.
-
        pred_leaf :
            When this option is on, the output will be a matrix of (nsample,
            ntrees) with each record indicating the predicted leaf index of
@@ -2196,7 +2138,6 @@ class Booster:
            raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
        if validate_features:
            self._validate_dmatrix_features(data)
-        iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
        args = {
            "type": 0,
            "training": training,
@@ -2522,8 +2463,6 @@ class Booster:
            self.best_iteration = int(self.attr("best_iteration"))  # type: ignore
        if self.attr("best_score") is not None:
            self.best_score = float(self.attr("best_score"))  # type: ignore
-        if self.attr("best_ntree_limit") is not None:
-            self.best_ntree_limit = int(self.attr("best_ntree_limit"))  # type: ignore

    def num_boosted_rounds(self) -> int:
        """Get number of boosted rounds.  For gblinear this is reset to 0 after
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -1653,14 +1653,11 @@ class DaskScikitLearnBase(XGBModel):
        self,
        X: _DataT,
        output_margin: bool = False,
-        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        base_margin: Optional[_DaskCollection] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> Any:
        _assert_dask_support()
-        msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
-        assert ntree_limit is None, msg
        return self.client.sync(
            self._predict_async,
            X,
@@ -1694,12 +1691,9 @@ class DaskScikitLearnBase(XGBModel):
    def apply(
        self,
        X: _DataT,
-        ntree_limit: Optional[int] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> Any:
        _assert_dask_support()
-        msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
-        assert ntree_limit is None, msg
        return self.client.sync(self._apply_async, X, iteration_range=iteration_range)

    def __await__(self) -> Awaitable[Any]:
@@ -1993,14 +1987,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
    def predict_proba(
        self,
        X: _DaskCollection,
-        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        base_margin: Optional[_DaskCollection] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> Any:
        _assert_dask_support()
-        msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
-        assert ntree_limit is None, msg
        return self._client_sync(
            self._predict_proba_async,
            X=X,
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -36,7 +36,6 @@ from .core import (
    Objective,
    QuantileDMatrix,
    XGBoostError,
-    _convert_ntree_limit,
    _deprecate_positional_args,
    _parse_eval_str,
 )
@@ -391,8 +390,7 @@ __model_doc = f"""
          metric will be used for early stopping.

-        - If early stopping occurs, the model will have three additional fields:
-          :py:attr:`best_score`, :py:attr:`best_iteration` and
-          :py:attr:`best_ntree_limit`.
+        - If early stopping occurs, the model will have two additional fields:
+          :py:attr:`best_score` and :py:attr:`best_iteration`.

        .. note::

@@ -1117,7 +1115,6 @@ class XGBModel(XGBModelBase):
        self,
        X: ArrayLike,
        output_margin: bool = False,
-        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
@@ -1135,8 +1132,6 @@ class XGBModel(XGBModelBase):
            Data to predict with.
        output_margin :
            Whether to output the raw untransformed margin value.
-        ntree_limit :
-            Deprecated, use `iteration_range` instead.
        validate_features :
            When this is True, validate that the Booster's and data's feature_names are
            identical.  Otherwise, it is assumed that the feature_names are the same.
@@ -1156,9 +1151,6 @@ class XGBModel(XGBModelBase):

        """
        with config_context(verbosity=self.verbosity):
-            iteration_range = _convert_ntree_limit(
-                self.get_booster(), ntree_limit, iteration_range
-            )
            iteration_range = self._get_iteration_range(iteration_range)
            if self._can_use_inplace_predict():
                try:
@@ -1197,7 +1189,6 @@ class XGBModel(XGBModelBase):
    def apply(
        self,
        X: ArrayLike,
-        ntree_limit: int = 0,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> np.ndarray:
        """Return the predicted leaf every tree for each sample. If the model is trained
@@ -1211,9 +1202,6 @@ class XGBModel(XGBModelBase):
        iteration_range :
            See :py:meth:`predict`.

-        ntree_limit :
-            Deprecated, use ``iteration_range`` instead.
-
        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
@@ -1223,9 +1211,6 @@ class XGBModel(XGBModelBase):

        """
        with config_context(verbosity=self.verbosity):
-            iteration_range = _convert_ntree_limit(
-                self.get_booster(), ntree_limit, iteration_range
-            )
            iteration_range = self._get_iteration_range(iteration_range)
            test_dmatrix = DMatrix(
                X,
@@ -1309,10 +1294,6 @@ class XGBModel(XGBModelBase):
        """
        return int(self._early_stopping_attr("best_iteration"))

-    @property
-    def best_ntree_limit(self) -> int:
-        return int(self._early_stopping_attr("best_ntree_limit"))
-
    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances property, return depends on `importance_type`
@@ -1562,7 +1543,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
        self,
        X: ArrayLike,
        output_margin: bool = False,
-        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
@@ -1571,7 +1551,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
            class_probs = super().predict(
                X=X,
                output_margin=output_margin,
-                ntree_limit=ntree_limit,
                validate_features=validate_features,
                base_margin=base_margin,
                iteration_range=iteration_range,
@@ -1599,7 +1578,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
    def predict_proba(
        self,
        X: ArrayLike,
-        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
@@ -1614,8 +1592,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
        ----------
        X : array_like
            Feature matrix. See :ref:`py-data` for a list of supported types.
-        ntree_limit : int
-            Deprecated, use `iteration_range` instead.
        validate_features : bool
            When this is True, validate that the Booster's and data's feature_names are
            identical.  Otherwise, it is assumed that the feature_names are the same.
@@ -1642,7 +1618,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
        if self.objective == "multi:softmax":
            raw_predt = super().predict(
                X=X,
-                ntree_limit=ntree_limit,
                validate_features=validate_features,
                base_margin=base_margin,
                iteration_range=iteration_range,
@@ -1652,7 +1627,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
            return class_prob
        class_probs = super().predict(
            X=X,
-            ntree_limit=ntree_limit,
            validate_features=validate_features,
            base_margin=base_margin,
            iteration_range=iteration_range,
@@ -2074,7 +2048,6 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
        self,
        X: ArrayLike,
        output_margin: bool = False,
-        ntree_limit: Optional[int] = None,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
@@ -2083,20 +2056,18 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
        return super().predict(
            X,
            output_margin,
-            ntree_limit,
            validate_features,
            base_margin,
-            iteration_range,
+            iteration_range=iteration_range,
        )

    def apply(
        self,
        X: ArrayLike,
-        ntree_limit: int = 0,
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> ArrayLike:
        X, _ = _get_qid(X, None)
-        return super().apply(X, ntree_limit, iteration_range)
+        return super().apply(X, iteration_range)

    def score(self, X: ArrayLike, y: ArrayLike) -> float:
        """Evaluate score for data using the last evaluation metric. If the model is
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -11,7 +11,6 @@ from xgboost import DataIter, DMatrix, QuantileDMatrix, XGBModel
 from xgboost.compat import concat

 from .._typing import ArrayLike
-from ..core import _convert_ntree_limit
 from .utils import get_logger  # type: ignore


@@ -343,8 +342,7 @@ def pred_contribs(
    strict_shape: bool = False,
 ) -> np.ndarray:
    """Predict contributions with data with the full model."""
-    iteration_range = _convert_ntree_limit(model.get_booster(), None, None)
-    iteration_range = model._get_iteration_range(iteration_range)
+    iteration_range = model._get_iteration_range(None)
    data_dmatrix = DMatrix(
        data,
        base_margin=base_margin,