Handle np integer in model slice and prediction. (#10007)

2024-01-26 04:58:48 +08:00
parent a76d6c6131
commit 65d7bf2dfe
7 changed files with 75 additions and 49 deletions
--- a/python-package/xgboost/_typing.py
+++ b/python-package/xgboost/_typing.py
@@ -36,6 +36,11 @@ PandasDType = Any  # real type is pandas.core.dtypes.base.ExtensionDtype

 FloatCompatible = Union[float, np.float32, np.float64]

+# typing.SupportsInt is not suitable here since floating point values are convertible to
+# integers as well.
+Integer = Union[int, np.integer]
+IterationRange = Tuple[Integer, Integer]
+
 # callables
 FPreProcCallable = Callable

--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -48,6 +48,8 @@ from ._typing import (
    FeatureInfo,
    FeatureNames,
    FeatureTypes,
+    Integer,
+    IterationRange,
    ModelIn,
    NumpyOrCupy,
    TransformedData,
@@ -1812,19 +1814,25 @@ class Booster:
            state["handle"] = handle
        self.__dict__.update(state)

-    def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
+    def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
        """Get a slice of the tree-based model.

        .. versionadded:: 1.3.0

        """
-        if isinstance(val, int):
-            val = slice(val, val + 1)
+        # convert to slice for all other types
+        if isinstance(val, (np.integer, int)):
+            val = slice(int(val), int(val + 1))
+        if isinstance(val, type(Ellipsis)):
+            val = slice(0, 0)
        if isinstance(val, tuple):
            raise ValueError("Only supports slicing through 1 dimension.")
+        # All supported types are now slice
+        # FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
        if not isinstance(val, slice):
-            msg = _expect((int, slice), type(val))
+            msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
            raise TypeError(msg)
+
        if isinstance(val.start, type(Ellipsis)) or val.start is None:
            start = 0
        else:
@@ -2246,12 +2254,13 @@ class Booster:
        pred_interactions: bool = False,
        validate_features: bool = True,
        training: bool = False,
-        iteration_range: Tuple[int, int] = (0, 0),
+        iteration_range: IterationRange = (0, 0),
        strict_shape: bool = False,
    ) -> np.ndarray:
-        """Predict with data.  The full model will be used unless `iteration_range` is specified,
-        meaning user have to either slice the model or use the ``best_iteration``
-        attribute to get prediction from best model returned from early stopping.
+        """Predict with data.  The full model will be used unless `iteration_range` is
+        specified, meaning user have to either slice the model or use the
+        ``best_iteration`` attribute to get prediction from best model returned from
+        early stopping.

        .. note::

@@ -2336,8 +2345,8 @@ class Booster:
        args = {
            "type": 0,
            "training": training,
-            "iteration_begin": iteration_range[0],
-            "iteration_end": iteration_range[1],
+            "iteration_begin": int(iteration_range[0]),
+            "iteration_end": int(iteration_range[1]),
            "strict_shape": strict_shape,
        }

@@ -2373,7 +2382,7 @@ class Booster:
    def inplace_predict(
        self,
        data: DataType,
-        iteration_range: Tuple[int, int] = (0, 0),
+        iteration_range: IterationRange = (0, 0),
        predict_type: str = "value",
        missing: float = np.nan,
        validate_features: bool = True,
@@ -2439,8 +2448,8 @@ class Booster:
        args = make_jcargs(
            type=1 if predict_type == "margin" else 0,
            training=False,
-            iteration_begin=iteration_range[0],
-            iteration_end=iteration_range[1],
+            iteration_begin=int(iteration_range[0]),
+            iteration_end=int(iteration_range[1]),
            missing=missing,
            strict_shape=strict_shape,
            cache_id=0,
--- a/python-package/xgboost/dask/__init__.py
+++ b/python-package/xgboost/dask/__init__.py
@@ -61,7 +61,7 @@ from typing import (
 import numpy

 from xgboost import collective, config
-from xgboost._typing import _T, FeatureNames, FeatureTypes
+from xgboost._typing import _T, FeatureNames, FeatureTypes, IterationRange
 from xgboost.callback import TrainingCallback
 from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
 from xgboost.core import (
@@ -1263,7 +1263,7 @@ async def _predict_async(
    approx_contribs: bool,
    pred_interactions: bool,
    validate_features: bool,
-    iteration_range: Tuple[int, int],
+    iteration_range: IterationRange,
    strict_shape: bool,
 ) -> _DaskCollection:
    _booster = await _get_model_future(client, model)
@@ -1410,7 +1410,7 @@ def predict(  # pylint: disable=unused-argument
    approx_contribs: bool = False,
    pred_interactions: bool = False,
    validate_features: bool = True,
-    iteration_range: Tuple[int, int] = (0, 0),
+    iteration_range: IterationRange = (0, 0),
    strict_shape: bool = False,
 ) -> Any:
    """Run prediction with a trained booster.
@@ -1458,7 +1458,7 @@ async def _inplace_predict_async(  # pylint: disable=too-many-branches
    global_config: Dict[str, Any],
    model: Union[Booster, Dict, "distributed.Future"],
    data: _DataT,
-    iteration_range: Tuple[int, int],
+    iteration_range: IterationRange,
    predict_type: str,
    missing: float,
    validate_features: bool,
@@ -1516,7 +1516,7 @@ def inplace_predict(  # pylint: disable=unused-argument
    client: Optional["distributed.Client"],
    model: Union[TrainReturnT, Booster, "distributed.Future"],
    data: _DataT,
-    iteration_range: Tuple[int, int] = (0, 0),
+    iteration_range: IterationRange = (0, 0),
    predict_type: str = "value",
    missing: float = numpy.nan,
    validate_features: bool = True,
@@ -1624,7 +1624,7 @@ class DaskScikitLearnBase(XGBModel):
        output_margin: bool,
        validate_features: bool,
        base_margin: Optional[_DaskCollection],
-        iteration_range: Optional[Tuple[int, int]],
+        iteration_range: Optional[IterationRange],
    ) -> Any:
        iteration_range = self._get_iteration_range(iteration_range)
        if self._can_use_inplace_predict():
@@ -1664,7 +1664,7 @@ class DaskScikitLearnBase(XGBModel):
        output_margin: bool = False,
        validate_features: bool = True,
        base_margin: Optional[_DaskCollection] = None,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> Any:
        _assert_dask_support()
        return self.client.sync(
@@ -1679,7 +1679,7 @@ class DaskScikitLearnBase(XGBModel):
    async def _apply_async(
        self,
        X: _DataT,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> Any:
        iteration_range = self._get_iteration_range(iteration_range)
        test_dmatrix = await DaskDMatrix(
@@ -1700,7 +1700,7 @@ class DaskScikitLearnBase(XGBModel):
    def apply(
        self,
        X: _DataT,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> Any:
        _assert_dask_support()
        return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
@@ -1962,7 +1962,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
        X: _DataT,
        validate_features: bool,
        base_margin: Optional[_DaskCollection],
-        iteration_range: Optional[Tuple[int, int]],
+        iteration_range: Optional[IterationRange],
    ) -> _DaskCollection:
        if self.objective == "multi:softmax":
            raise ValueError(
@@ -1987,7 +1987,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
        X: _DaskCollection,
        validate_features: bool = True,
        base_margin: Optional[_DaskCollection] = None,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> Any:
        _assert_dask_support()
        return self._client_sync(
@@ -2006,7 +2006,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
        output_margin: bool,
        validate_features: bool,
        base_margin: Optional[_DaskCollection],
-        iteration_range: Optional[Tuple[int, int]],
+        iteration_range: Optional[IterationRange],
    ) -> _DaskCollection:
        pred_probs = await super()._predict_async(
            data, output_margin, validate_features, base_margin, iteration_range
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -22,7 +22,7 @@ from typing import (
 import numpy as np
 from scipy.special import softmax

-from ._typing import ArrayLike, FeatureNames, FeatureTypes, ModelIn
+from ._typing import ArrayLike, FeatureNames, FeatureTypes, IterationRange, ModelIn
 from .callback import TrainingCallback

 # Do not use class names on scikit-learn directly.  Re-define the classes on
@@ -1039,8 +1039,8 @@ class XGBModel(XGBModelBase):
        return False

    def _get_iteration_range(
-        self, iteration_range: Optional[Tuple[int, int]]
-    ) -> Tuple[int, int]:
+        self, iteration_range: Optional[IterationRange]
+    ) -> IterationRange:
        if iteration_range is None or iteration_range[1] == 0:
            # Use best_iteration if defined.
            try:
@@ -1057,7 +1057,7 @@ class XGBModel(XGBModelBase):
        output_margin: bool = False,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> ArrayLike:
        """Predict with `X`.  If the model is trained with early stopping, then
        :py:attr:`best_iteration` is used automatically. The estimator uses
@@ -1129,7 +1129,7 @@ class XGBModel(XGBModelBase):
    def apply(
        self,
        X: ArrayLike,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> np.ndarray:
        """Return the predicted leaf every tree for each sample. If the model is trained
        with early stopping, then :py:attr:`best_iteration` is used automatically.
@@ -1465,7 +1465,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        output_margin: bool = False,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> ArrayLike:
        with config_context(verbosity=self.verbosity):
            class_probs = super().predict(
@@ -1500,7 +1500,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        X: ArrayLike,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> np.ndarray:
        """Predict the probability of each `X` example being of a given class. If the
        model is trained with early stopping, then :py:attr:`best_iteration` is used
@@ -1942,7 +1942,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
        output_margin: bool = False,
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> ArrayLike:
        X, _ = _get_qid(X, None)
        return super().predict(
@@ -1956,7 +1956,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
    def apply(
        self,
        X: ArrayLike,
-        iteration_range: Optional[Tuple[int, int]] = None,
+        iteration_range: Optional[IterationRange] = None,
    ) -> ArrayLike:
        X, _ = _get_qid(X, None)
        return super().apply(X, iteration_range)