Remove ntree limit in python package. (#8345)

- Remove `ntree_limit`. The parameter has been deprecated since 1.4.0.
- This change breaks compatibility with the SHAP package.
This commit is contained in:
Jiaming Yuan
2023-03-31 19:01:55 +08:00
committed by GitHub
parent b647403baa
commit bac22734fb
17 changed files with 284 additions and 357 deletions

View File

@@ -23,13 +23,7 @@ from typing import (
import numpy
from . import collective
from .core import (
Booster,
DMatrix,
XGBoostError,
_get_booster_layer_trees,
_parse_eval_str,
)
from .core import Booster, DMatrix, XGBoostError, _parse_eval_str
__all__ = [
"TrainingCallback",
@@ -177,22 +171,14 @@ class CallbackContainer:
assert isinstance(model, Booster), msg
if not self.is_cv:
num_parallel_tree, _ = _get_booster_layer_trees(model)
if model.attr("best_score") is not None:
model.best_score = float(cast(str, model.attr("best_score")))
model.best_iteration = int(cast(str, model.attr("best_iteration")))
# num_class is handled internally
model.set_attr(
best_ntree_limit=str((model.best_iteration + 1) * num_parallel_tree)
)
model.best_ntree_limit = int(cast(str, model.attr("best_ntree_limit")))
else:
# Due to compatibility with version older than 1.4, these attributes are
# added to Python object even if early stopping is not used.
model.best_iteration = model.num_boosted_rounds() - 1
model.set_attr(best_iteration=str(model.best_iteration))
model.best_ntree_limit = (model.best_iteration + 1) * num_parallel_tree
model.set_attr(best_ntree_limit=str(model.best_ntree_limit))
return model

View File

@@ -126,25 +126,6 @@ def _parse_eval_str(result: str) -> List[Tuple[str, float]]:
IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int])
def _convert_ntree_limit(
booster: "Booster", ntree_limit: Optional[int], iteration_range: IterRange
) -> IterRange:
if ntree_limit is not None and ntree_limit != 0:
warnings.warn(
"ntree_limit is deprecated, use `iteration_range` or model "
"slicing instead.",
UserWarning,
)
if iteration_range is not None and iteration_range[1] != 0:
raise ValueError(
"Only one of `iteration_range` and `ntree_limit` can be non zero."
)
num_parallel_tree, _ = _get_booster_layer_trees(booster)
num_parallel_tree = max([num_parallel_tree, 1])
iteration_range = (0, ntree_limit // num_parallel_tree)
return iteration_range
def _expect(expectations: Sequence[Type], got: Type) -> str:
"""Translate input error into string.
@@ -1508,41 +1489,6 @@ Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
"""Get number of trees added to booster per-iteration. This function will be removed
once `best_ntree_limit` is dropped in favor of `best_iteration`. Returns
`num_parallel_tree` and `num_groups`.
"""
config = json.loads(model.save_config())
booster = config["learner"]["gradient_booster"]["name"]
if booster == "gblinear":
num_parallel_tree = 0
elif booster == "dart":
num_parallel_tree = int(
config["learner"]["gradient_booster"]["gbtree"]["gbtree_model_param"][
"num_parallel_tree"
]
)
elif booster == "gbtree":
try:
num_parallel_tree = int(
config["learner"]["gradient_booster"]["gbtree_model_param"][
"num_parallel_tree"
]
)
except KeyError:
num_parallel_tree = int(
config["learner"]["gradient_booster"]["gbtree_train_param"][
"num_parallel_tree"
]
)
else:
raise ValueError(f"Unknown booster: {booster}")
num_groups = int(config["learner"]["learner_model_param"]["num_class"])
return num_parallel_tree, num_groups
def _configure_metrics(params: BoosterParam) -> BoosterParam:
if (
isinstance(params, dict)
@@ -1576,11 +1522,11 @@ class Booster:
"""
Parameters
----------
params : dict
params :
Parameters for boosters.
cache : list
cache :
List of cache items.
model_file : string/os.PathLike/Booster/bytearray
model_file :
Path to the model file if it's string or PathLike.
"""
cache = cache if cache is not None else []
@@ -2100,7 +2046,6 @@ class Booster:
self,
data: DMatrix,
output_margin: bool = False,
ntree_limit: int = 0,
pred_leaf: bool = False,
pred_contribs: bool = False,
approx_contribs: bool = False,
@@ -2127,9 +2072,6 @@ class Booster:
output_margin :
Whether to output the raw untransformed margin value.
ntree_limit :
Deprecated, use `iteration_range` instead.
pred_leaf :
When this option is on, the output will be a matrix of (nsample,
ntrees) with each record indicating the predicted leaf index of
@@ -2196,7 +2138,6 @@ class Booster:
raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
if validate_features:
self._validate_dmatrix_features(data)
iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
args = {
"type": 0,
"training": training,
@@ -2522,8 +2463,6 @@ class Booster:
self.best_iteration = int(self.attr("best_iteration")) # type: ignore
if self.attr("best_score") is not None:
self.best_score = float(self.attr("best_score")) # type: ignore
if self.attr("best_ntree_limit") is not None:
self.best_ntree_limit = int(self.attr("best_ntree_limit")) # type: ignore
def num_boosted_rounds(self) -> int:
"""Get number of boosted rounds. For gblinear this is reset to 0 after

View File

@@ -1653,14 +1653,11 @@ class DaskScikitLearnBase(XGBModel):
self,
X: _DataT,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
_assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self.client.sync(
self._predict_async,
X,
@@ -1694,12 +1691,9 @@ class DaskScikitLearnBase(XGBModel):
def apply(
self,
X: _DataT,
ntree_limit: Optional[int] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
_assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
def __await__(self) -> Awaitable[Any]:
@@ -1993,14 +1987,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
def predict_proba(
self,
X: _DaskCollection,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
_assert_dask_support()
msg = "`ntree_limit` is not supported on dask, use `iteration_range` instead."
assert ntree_limit is None, msg
return self._client_sync(
self._predict_proba_async,
X=X,

View File

@@ -36,7 +36,6 @@ from .core import (
Objective,
QuantileDMatrix,
XGBoostError,
_convert_ntree_limit,
_deprecate_positional_args,
_parse_eval_str,
)
@@ -391,8 +390,7 @@ __model_doc = f"""
metric will be used for early stopping.
- If early stopping occurs, the model will have two additional fields:
:py:attr:`best_score`, :py:attr:`best_iteration` and
:py:attr:`best_ntree_limit`.
:py:attr:`best_score` and :py:attr:`best_iteration`.
.. note::
@@ -1117,7 +1115,6 @@ class XGBModel(XGBModelBase):
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@@ -1135,8 +1132,6 @@ class XGBModel(XGBModelBase):
Data to predict with.
output_margin :
Whether to output the raw untransformed margin value.
ntree_limit :
Deprecated, use `iteration_range` instead.
validate_features :
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
@@ -1156,9 +1151,6 @@ class XGBModel(XGBModelBase):
"""
with config_context(verbosity=self.verbosity):
iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range
)
iteration_range = self._get_iteration_range(iteration_range)
if self._can_use_inplace_predict():
try:
@@ -1197,7 +1189,6 @@ class XGBModel(XGBModelBase):
def apply(
self,
X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
"""Return the predicted leaf every tree for each sample. If the model is trained
@@ -1211,9 +1202,6 @@ class XGBModel(XGBModelBase):
iteration_range :
See :py:meth:`predict`.
ntree_limit :
Deprecated, use ``iteration_range`` instead.
Returns
-------
X_leaves : array_like, shape=[n_samples, n_trees]
@@ -1223,9 +1211,6 @@ class XGBModel(XGBModelBase):
"""
with config_context(verbosity=self.verbosity):
iteration_range = _convert_ntree_limit(
self.get_booster(), ntree_limit, iteration_range
)
iteration_range = self._get_iteration_range(iteration_range)
test_dmatrix = DMatrix(
X,
@@ -1309,10 +1294,6 @@ class XGBModel(XGBModelBase):
"""
return int(self._early_stopping_attr("best_iteration"))
@property
def best_ntree_limit(self) -> int:
return int(self._early_stopping_attr("best_ntree_limit"))
@property
def feature_importances_(self) -> np.ndarray:
"""Feature importances property, return depends on `importance_type`
@@ -1562,7 +1543,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@@ -1571,7 +1551,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
class_probs = super().predict(
X=X,
output_margin=output_margin,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
@@ -1599,7 +1578,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def predict_proba(
self,
X: ArrayLike,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@@ -1614,8 +1592,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
----------
X : array_like
Feature matrix. See :ref:`py-data` for a list of supported types.
ntree_limit : int
Deprecated, use `iteration_range` instead.
validate_features : bool
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
@@ -1642,7 +1618,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
if self.objective == "multi:softmax":
raw_predt = super().predict(
X=X,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
@@ -1652,7 +1627,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
return class_prob
class_probs = super().predict(
X=X,
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
@@ -2074,7 +2048,6 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
self,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
@@ -2083,20 +2056,18 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
return super().predict(
X,
output_margin,
ntree_limit,
validate_features,
base_margin,
iteration_range,
iteration_range=iteration_range,
)
def apply(
self,
X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().apply(X, ntree_limit, iteration_range)
return super().apply(X, iteration_range)
def score(self, X: ArrayLike, y: ArrayLike) -> float:
"""Evaluate score for data using the last evaluation metric. If the model is

View File

@@ -11,7 +11,6 @@ from xgboost import DataIter, DMatrix, QuantileDMatrix, XGBModel
from xgboost.compat import concat
from .._typing import ArrayLike
from ..core import _convert_ntree_limit
from .utils import get_logger # type: ignore
@@ -343,8 +342,7 @@ def pred_contribs(
strict_shape: bool = False,
) -> np.ndarray:
"""Predict contributions with data with the full model."""
iteration_range = _convert_ntree_limit(model.get_booster(), None, None)
iteration_range = model._get_iteration_range(iteration_range)
iteration_range = model._get_iteration_range(None)
data_dmatrix = DMatrix(
data,
base_margin=base_margin,