Define best_iteration only if early stopping is used. (#9403)

* Define `best_iteration` only if early stopping is used.

This is the behavior specified by the document but not honored in the actual code.

- Don't set the attributes if there's no early stopping.
- Clean up the code for callbacks, and replace assertions with proper exceptions.
- Assign the attributes when early stopping `save_best` is used.
- Turn the attributes into Python properties.

---------

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2023-07-24 12:43:35 +08:00
committed by GitHub
parent 01e00efc53
commit 851cba931e
10 changed files with 249 additions and 179 deletions

View File

@@ -134,13 +134,17 @@ class CallbackContainer:
is_cv: bool = False,
) -> None:
self.callbacks = set(callbacks)
if metric is not None:
msg = (
"metric must be callable object for monitoring. For "
+ "builtin metrics, passing them in training parameter"
+ " will invoke monitor automatically."
)
assert callable(metric), msg
for cb in callbacks:
if not isinstance(cb, TrainingCallback):
raise TypeError("callback must be an instance of `TrainingCallback`.")
msg = (
"metric must be callable object for monitoring. For builtin metrics"
", passing them in training parameter invokes monitor automatically."
)
if metric is not None and not callable(metric):
raise TypeError(msg)
self.metric = metric
self.history: TrainingCallback.EvalsLog = collections.OrderedDict()
self._output_margin = output_margin
@@ -170,16 +174,6 @@ class CallbackContainer:
else:
assert isinstance(model, Booster), msg
if not self.is_cv:
if model.attr("best_score") is not None:
model.best_score = float(cast(str, model.attr("best_score")))
model.best_iteration = int(cast(str, model.attr("best_iteration")))
else:
# Due to compatibility with version older than 1.4, these attributes are
# added to Python object even if early stopping is not used.
model.best_iteration = model.num_boosted_rounds() - 1
model.set_attr(best_iteration=str(model.best_iteration))
return model
def before_iteration(
@@ -267,9 +261,14 @@ class LearningRateScheduler(TrainingCallback):
def __init__(
self, learning_rates: Union[Callable[[int], float], Sequence[float]]
) -> None:
assert callable(learning_rates) or isinstance(
if not callable(learning_rates) and not isinstance(
learning_rates, collections.abc.Sequence
)
):
raise TypeError(
"Invalid learning rates, expecting callable or sequence, got: "
f"{type(learning_rates)}"
)
if callable(learning_rates):
self.learning_rates = learning_rates
else:
@@ -302,24 +301,28 @@ class EarlyStopping(TrainingCallback):
save_best :
Whether training should return the best model or the last model.
min_delta :
Minimum absolute change in score to be qualified as an improvement.
.. versionadded:: 1.5.0
.. code-block:: python
Minimum absolute change in score to be qualified as an improvement.
es = xgboost.callback.EarlyStopping(
rounds=2,
min_delta=1e-3,
save_best=True,
maximize=False,
data_name="validation_0",
metric_name="mlogloss",
)
clf = xgboost.XGBClassifier(tree_method="gpu_hist", callbacks=[es])
Examples
--------
X, y = load_digits(return_X_y=True)
clf.fit(X, y, eval_set=[(X, y)])
.. code-block:: python
es = xgboost.callback.EarlyStopping(
rounds=2,
min_delta=1e-3,
save_best=True,
maximize=False,
data_name="validation_0",
metric_name="mlogloss",
)
clf = xgboost.XGBClassifier(tree_method="hist", device="cuda", callbacks=[es])
X, y = load_digits(return_X_y=True)
clf.fit(X, y, eval_set=[(X, y)])
"""
# pylint: disable=too-many-arguments
@@ -363,7 +366,7 @@ class EarlyStopping(TrainingCallback):
return numpy.greater(get_s(new) - self._min_delta, get_s(best))
def minimize(new: _Score, best: _Score) -> bool:
"""New score should be smaller than the old one."""
"""New score should be lesser than the old one."""
return numpy.greater(get_s(best) - self._min_delta, get_s(new))
if self.maximize is None:
@@ -419,38 +422,53 @@ class EarlyStopping(TrainingCallback):
) -> bool:
epoch += self.starting_round # training continuation
msg = "Must have at least 1 validation dataset for early stopping."
assert len(evals_log.keys()) >= 1, msg
data_name = ""
if len(evals_log.keys()) < 1:
raise ValueError(msg)
# Get data name
if self.data:
for d, _ in evals_log.items():
if d == self.data:
data_name = d
if not data_name:
raise ValueError("No dataset named:", self.data)
data_name = self.data
else:
# Use the last one as default.
data_name = list(evals_log.keys())[-1]
assert isinstance(data_name, str) and data_name
if data_name not in evals_log:
raise ValueError(f"No dataset named: {data_name}")
if not isinstance(data_name, str):
raise TypeError(
f"The name of the dataset should be a string. Got: {type(data_name)}"
)
data_log = evals_log[data_name]
# Filter out scores that can not be used for early stopping.
# Get metric name
if self.metric_name:
metric_name = self.metric_name
else:
# Use last metric by default.
assert isinstance(data_log, collections.OrderedDict)
metric_name = list(data_log.keys())[-1]
if metric_name not in data_log:
raise ValueError(f"No metric named: {metric_name}")
# The latest score
score = data_log[metric_name][-1]
return self._update_rounds(score, data_name, metric_name, model, epoch)
def after_training(self, model: _Model) -> _Model:
if not self.save_best:
return model
try:
if self.save_best:
model = model[: int(model.attr("best_iteration")) + 1]
best_iteration = model.best_iteration
best_score = model.best_score
assert best_iteration is not None and best_score is not None
model = model[: best_iteration + 1]
model.best_iteration = best_iteration
model.best_score = best_score
except XGBoostError as e:
raise XGBoostError(
"`save_best` is not applicable to current booster"
"`save_best` is not applicable to the current booster"
) from e
return model
@@ -462,8 +480,6 @@ class EvaluationMonitor(TrainingCallback):
Parameters
----------
metric :
Extra user defined metric.
rank :
Which worker should be used for printing the result.
period :

View File

@@ -1890,7 +1890,7 @@ class Booster:
attr_names = from_cstr_to_pystr(sarr, length)
return {n: self.attr(n) for n in attr_names}
def set_attr(self, **kwargs: Optional[str]) -> None:
def set_attr(self, **kwargs: Optional[Any]) -> None:
"""Set the attribute of the Booster.
Parameters
@@ -2559,10 +2559,35 @@ class Booster:
else:
raise TypeError("Unknown file type: ", fname)
if self.attr("best_iteration") is not None:
self.best_iteration = int(cast(int, self.attr("best_iteration")))
if self.attr("best_score") is not None:
self.best_score = float(cast(float, self.attr("best_score")))
@property
def best_iteration(self) -> int:
"""The best iteration during training."""
best = self.attr("best_iteration")
if best is not None:
return int(best)
raise AttributeError(
"`best_iteration` is only defined when early stopping is used."
)
@best_iteration.setter
def best_iteration(self, iteration: int) -> None:
self.set_attr(best_iteration=iteration)
@property
def best_score(self) -> float:
"""The best evaluation score during training."""
best = self.attr("best_score")
if best is not None:
return float(best)
raise AttributeError(
"`best_score` is only defined when early stopping is used."
)
@best_score.setter
def best_score(self, score: int) -> None:
self.set_attr(best_score=score)
def num_boosted_rounds(self) -> int:
"""Get number of boosted rounds. For gblinear this is reset to 0 after

View File

@@ -230,10 +230,10 @@ __model_doc = f"""
subsample : Optional[float]
Subsample ratio of the training instance.
sampling_method :
Sampling method. Used only by `gpu_hist` tree method.
- `uniform`: select random training instances uniformly.
- `gradient_based` select random training instances with higher probability when
the gradient and hessian are larger. (cf. CatBoost)
Sampling method. Used only by the GPU version of ``hist`` tree method.
- ``uniform``: select random training instances uniformly.
- ``gradient_based`` select random training instances with higher probability
when the gradient and hessian are larger. (cf. CatBoost)
colsample_bytree : Optional[float]
Subsample ratio of columns when constructing each tree.
colsample_bylevel : Optional[float]
@@ -992,12 +992,12 @@ class XGBModel(XGBModelBase):
X :
Feature matrix. See :ref:`py-data` for a list of supported types.
When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
When the ``tree_method`` is set to ``hist``, internally, the
:py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
for conserving memory. However, this has performance implications when the
device of input data is not matched with algorithm. For instance, if the
input is a numpy array on CPU but ``gpu_hist`` is used for training, then
the data is first processed on CPU then transferred to GPU.
input is a numpy array on CPU but ``cuda`` is used for training, then the
data is first processed on CPU then transferred to GPU.
y :
Labels
sample_weight :
@@ -1279,19 +1279,10 @@ class XGBModel(XGBModelBase):
)
return np.array(feature_names)
def _early_stopping_attr(self, attr: str) -> Union[float, int]:
booster = self.get_booster()
try:
return getattr(booster, attr)
except AttributeError as e:
raise AttributeError(
f"`{attr}` is only defined when early stopping is used."
) from e
@property
def best_score(self) -> float:
"""The best score obtained by early stopping."""
return float(self._early_stopping_attr("best_score"))
return self.get_booster().best_score
@property
def best_iteration(self) -> int:
@@ -1299,7 +1290,7 @@ class XGBModel(XGBModelBase):
for instance if the best iteration is the first round, then best_iteration is 0.
"""
return int(self._early_stopping_attr("best_iteration"))
return self.get_booster().best_iteration
@property
def feature_importances_(self) -> np.ndarray:
@@ -1926,12 +1917,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
| 1 | :math:`x_{20}` | :math:`x_{21}` |
+-----+----------------+----------------+
When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
When the ``tree_method`` is set to ``hist``, internally, the
:py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
for conserving memory. However, this has performance implications when the
device of input data is not matched with algorithm. For instance, if the
input is a numpy array on CPU but ``gpu_hist`` is used for training, then
the data is first processed on CPU then transferred to GPU.
input is a numpy array on CPU but ``cuda`` is used for training, then the
data is first processed on CPU then transferred to GPU.
y :
Labels
group :

View File

@@ -28,17 +28,6 @@ from .core import (
_CVFolds = Sequence["CVPack"]
def _assert_new_callback(callbacks: Optional[Sequence[TrainingCallback]]) -> None:
is_new_callback: bool = not callbacks or all(
isinstance(c, TrainingCallback) for c in callbacks
)
if not is_new_callback:
link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
raise ValueError(
f"Old style callback was removed in version 1.6. See: {link}."
)
def _configure_custom_metric(
feval: Optional[Metric], custom_metric: Optional[Metric]
) -> Optional[Metric]:
@@ -170,7 +159,6 @@ def train(
bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
start_iteration = 0
_assert_new_callback(callbacks)
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(EvaluationMonitor(period=verbose_eval))
@@ -247,7 +235,7 @@ class _PackedBooster:
result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
return result
def set_attr(self, **kwargs: Optional[str]) -> Any:
def set_attr(self, **kwargs: Optional[Any]) -> Any:
"""Iterate through folds for setting attributes"""
for f in self.cvfolds:
f.bst.set_attr(**kwargs)
@@ -274,11 +262,20 @@ class _PackedBooster:
"""Get best_iteration"""
return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))
@best_iteration.setter
def best_iteration(self, iteration: int) -> None:
"""Get best_iteration"""
self.set_attr(best_iteration=iteration)
@property
def best_score(self) -> float:
"""Get best_score."""
return float(cast(float, self.cvfolds[0].bst.attr("best_score")))
@best_score.setter
def best_score(self, score: float) -> None:
self.set_attr(best_score=score)
def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
"""
@@ -551,7 +548,6 @@ def cv(
# setup callbacks
callbacks = [] if callbacks is None else copy.copy(list(callbacks))
_assert_new_callback(callbacks)
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval