Use UBJ in Python checkpoint. (#9958)

This commit is contained in:
Jiaming Yuan
2024-01-09 03:22:15 +08:00
committed by GitHub
parent fa5e2f6c45
commit b3eb5d0945
7 changed files with 104 additions and 46 deletions

View File

@@ -62,11 +62,31 @@ class TrainingCallback(ABC):
return model
def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
"""Run before each iteration. Return True when training should stop."""
"""Run before each iteration. Returns True when training should stop. See
:py:meth:`after_iteration` for details.
"""
return False
def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
"""Run after each iteration. Return True when training should stop."""
"""Run after each iteration. Returns `True` when training should stop.
Parameters
----------
model :
Either a :py:class:`~xgboost.Booster` object or a CVPack if the cv function
in xgboost is being used.
epoch :
The current training iteration.
evals_log :
A dictionary containing the evaluation history:
.. code-block:: python
{"data_name": {"metric_name": [0.5, ...]}}
"""
return False
@@ -547,14 +567,16 @@ class TrainingCheckPoint(TrainingCallback):
.. versionadded:: 1.3.0
Since XGBoost 2.1.0, the default format is changed to UBJSON.
Parameters
----------
directory :
Output model directory.
name :
pattern of output model file. Models will be saved as name_0.json, name_1.json,
name_2.json ....
pattern of output model file. Models will be saved as name_0.ubj, name_1.ubj,
name_2.ubj ....
as_pickle :
When set to True, all training parameters will be saved in pickle format,
instead of saving only the model.
@@ -564,6 +586,8 @@ class TrainingCheckPoint(TrainingCallback):
"""
default_format = "ubj"
def __init__(
self,
directory: Union[str, os.PathLike],
@@ -592,7 +616,7 @@ class TrainingCheckPoint(TrainingCallback):
self._name
+ "_"
+ (str(epoch + self._start))
+ (".pkl" if self._as_pickle else ".json"),
+ (".pkl" if self._as_pickle else f".{self.default_format}"),
)
self._epoch = 0 # reset counter
if collective.get_rank() == 0:

View File

@@ -2591,9 +2591,8 @@ class Booster:
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) will not be saved when using binary format. To save
those attributes, use JSON/UBJ instead. See :doc:`Model IO
</tutorials/saving_model>` for more info.
(such as feature_names) are only saved when using JSON or UBJSON (default)
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
.. code-block:: python
@@ -2616,12 +2615,15 @@ class Booster:
def save_raw(self, raw_format: str = "ubj") -> bytearray:
"""Save the model to a in memory buffer representation instead of file.
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) are only saved when using JSON or UBJSON (default)
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
Parameters
----------
raw_format :
Format of output buffer. Can be `json`, `ubj` or `deprecated`. Right now
the default is `deprecated` but it will be changed to `ubj` (universal binary
json) in the future.
Format of output buffer. Can be `json`, `ubj` or `deprecated`.
Returns
-------
@@ -2640,11 +2642,10 @@ class Booster:
def load_model(self, fname: ModelIn) -> None:
"""Load the model from a file or a bytearray.
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
feature_names) will not be loaded when using binary format. To save those
attributes, use JSON/UBJ instead. See :doc:`Model IO </tutorials/saving_model>`
for more info.
The model is loaded from an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) are only saved when using JSON or UBJSON (default)
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
.. code-block:: python
@@ -2769,9 +2770,9 @@ class Booster:
with_stats: bool = False,
dump_format: str = "text",
) -> List[str]:
"""Returns the model dump as a list of strings. Unlike :py:meth:`save_model`, the output
format is primarily used for visualization or interpretation, hence it's more
human readable but cannot be loaded back to XGBoost.
"""Returns the model dump as a list of strings. Unlike :py:meth:`save_model`,
the output format is primarily used for visualization or interpretation, hence
it's more human readable but cannot be loaded back to XGBoost.
Parameters
----------