Handle np integer in model slice and prediction. (#10007)

This commit is contained in:
Jiaming Yuan
2024-01-26 04:58:48 +08:00
committed by GitHub
parent a76d6c6131
commit 65d7bf2dfe
7 changed files with 75 additions and 49 deletions

View File

@@ -36,6 +36,11 @@ PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype
FloatCompatible = Union[float, np.float32, np.float64]
# typing.SupportsInt is not suitable here since floating point values are convertible to
# integers as well.
Integer = Union[int, np.integer]
IterationRange = Tuple[Integer, Integer]
# callables
FPreProcCallable = Callable

View File

@@ -48,6 +48,8 @@ from ._typing import (
FeatureInfo,
FeatureNames,
FeatureTypes,
Integer,
IterationRange,
ModelIn,
NumpyOrCupy,
TransformedData,
@@ -1812,19 +1814,25 @@ class Booster:
state["handle"] = handle
self.__dict__.update(state)
def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
"""Get a slice of the tree-based model.
.. versionadded:: 1.3.0
"""
if isinstance(val, int):
val = slice(val, val + 1)
# convert to slice for all other types
if isinstance(val, (np.integer, int)):
val = slice(int(val), int(val + 1))
if isinstance(val, type(Ellipsis)):
val = slice(0, 0)
if isinstance(val, tuple):
raise ValueError("Only supports slicing through 1 dimension.")
# All supported types are now slice
# FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
if not isinstance(val, slice):
msg = _expect((int, slice), type(val))
msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
raise TypeError(msg)
if isinstance(val.start, type(Ellipsis)) or val.start is None:
start = 0
else:
@@ -2246,12 +2254,13 @@ class Booster:
pred_interactions: bool = False,
validate_features: bool = True,
training: bool = False,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
strict_shape: bool = False,
) -> np.ndarray:
"""Predict with data. The full model will be used unless `iteration_range` is specified,
meaning user have to either slice the model or use the ``best_iteration``
attribute to get prediction from best model returned from early stopping.
"""Predict with data. The full model will be used unless `iteration_range` is
specified, meaning user have to either slice the model or use the
``best_iteration`` attribute to get prediction from best model returned from
early stopping.
.. note::
@@ -2336,8 +2345,8 @@ class Booster:
args = {
"type": 0,
"training": training,
"iteration_begin": iteration_range[0],
"iteration_end": iteration_range[1],
"iteration_begin": int(iteration_range[0]),
"iteration_end": int(iteration_range[1]),
"strict_shape": strict_shape,
}
@@ -2373,7 +2382,7 @@ class Booster:
def inplace_predict(
self,
data: DataType,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
predict_type: str = "value",
missing: float = np.nan,
validate_features: bool = True,
@@ -2439,8 +2448,8 @@ class Booster:
args = make_jcargs(
type=1 if predict_type == "margin" else 0,
training=False,
iteration_begin=iteration_range[0],
iteration_end=iteration_range[1],
iteration_begin=int(iteration_range[0]),
iteration_end=int(iteration_range[1]),
missing=missing,
strict_shape=strict_shape,
cache_id=0,

View File

@@ -61,7 +61,7 @@ from typing import (
import numpy
from xgboost import collective, config
from xgboost._typing import _T, FeatureNames, FeatureTypes
from xgboost._typing import _T, FeatureNames, FeatureTypes, IterationRange
from xgboost.callback import TrainingCallback
from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
from xgboost.core import (
@@ -1263,7 +1263,7 @@ async def _predict_async(
approx_contribs: bool,
pred_interactions: bool,
validate_features: bool,
iteration_range: Tuple[int, int],
iteration_range: IterationRange,
strict_shape: bool,
) -> _DaskCollection:
_booster = await _get_model_future(client, model)
@@ -1410,7 +1410,7 @@ def predict( # pylint: disable=unused-argument
approx_contribs: bool = False,
pred_interactions: bool = False,
validate_features: bool = True,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
strict_shape: bool = False,
) -> Any:
"""Run prediction with a trained booster.
@@ -1458,7 +1458,7 @@ async def _inplace_predict_async( # pylint: disable=too-many-branches
global_config: Dict[str, Any],
model: Union[Booster, Dict, "distributed.Future"],
data: _DataT,
iteration_range: Tuple[int, int],
iteration_range: IterationRange,
predict_type: str,
missing: float,
validate_features: bool,
@@ -1516,7 +1516,7 @@ def inplace_predict( # pylint: disable=unused-argument
client: Optional["distributed.Client"],
model: Union[TrainReturnT, Booster, "distributed.Future"],
data: _DataT,
iteration_range: Tuple[int, int] = (0, 0),
iteration_range: IterationRange = (0, 0),
predict_type: str = "value",
missing: float = numpy.nan,
validate_features: bool = True,
@@ -1624,7 +1624,7 @@ class DaskScikitLearnBase(XGBModel):
output_margin: bool,
validate_features: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
iteration_range: Optional[IterationRange],
) -> Any:
iteration_range = self._get_iteration_range(iteration_range)
if self._can_use_inplace_predict():
@@ -1664,7 +1664,7 @@ class DaskScikitLearnBase(XGBModel):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
_assert_dask_support()
return self.client.sync(
@@ -1679,7 +1679,7 @@ class DaskScikitLearnBase(XGBModel):
async def _apply_async(
self,
X: _DataT,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
iteration_range = self._get_iteration_range(iteration_range)
test_dmatrix = await DaskDMatrix(
@@ -1700,7 +1700,7 @@ class DaskScikitLearnBase(XGBModel):
def apply(
self,
X: _DataT,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
_assert_dask_support()
return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
@@ -1962,7 +1962,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
X: _DataT,
validate_features: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
iteration_range: Optional[IterationRange],
) -> _DaskCollection:
if self.objective == "multi:softmax":
raise ValueError(
@@ -1987,7 +1987,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
X: _DaskCollection,
validate_features: bool = True,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> Any:
_assert_dask_support()
return self._client_sync(
@@ -2006,7 +2006,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
output_margin: bool,
validate_features: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
iteration_range: Optional[IterationRange],
) -> _DaskCollection:
pred_probs = await super()._predict_async(
data, output_margin, validate_features, base_margin, iteration_range

View File

@@ -22,7 +22,7 @@ from typing import (
import numpy as np
from scipy.special import softmax
from ._typing import ArrayLike, FeatureNames, FeatureTypes, ModelIn
from ._typing import ArrayLike, FeatureNames, FeatureTypes, IterationRange, ModelIn
from .callback import TrainingCallback
# Do not use class names on scikit-learn directly. Re-define the classes on
@@ -1039,8 +1039,8 @@ class XGBModel(XGBModelBase):
return False
def _get_iteration_range(
self, iteration_range: Optional[Tuple[int, int]]
) -> Tuple[int, int]:
self, iteration_range: Optional[IterationRange]
) -> IterationRange:
if iteration_range is None or iteration_range[1] == 0:
# Use best_iteration if defined.
try:
@@ -1057,7 +1057,7 @@ class XGBModel(XGBModelBase):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then
:py:attr:`best_iteration` is used automatically. The estimator uses
@@ -1129,7 +1129,7 @@ class XGBModel(XGBModelBase):
def apply(
self,
X: ArrayLike,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> np.ndarray:
"""Return the predicted leaf every tree for each sample. If the model is trained
with early stopping, then :py:attr:`best_iteration` is used automatically.
@@ -1465,7 +1465,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
with config_context(verbosity=self.verbosity):
class_probs = super().predict(
@@ -1500,7 +1500,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
X: ArrayLike,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> np.ndarray:
"""Predict the probability of each `X` example being of a given class. If the
model is trained with early stopping, then :py:attr:`best_iteration` is used
@@ -1942,7 +1942,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
output_margin: bool = False,
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().predict(
@@ -1956,7 +1956,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
def apply(
self,
X: ArrayLike,
iteration_range: Optional[Tuple[int, int]] = None,
iteration_range: Optional[IterationRange] = None,
) -> ArrayLike:
X, _ = _get_qid(X, None)
return super().apply(X, iteration_range)