Use UBJ in Python checkpoint. (#9958)

Jiaming Yuan 2024-01-09 03:22:15 +08:00, committed by GitHub
parent fa5e2f6c45
commit b3eb5d0945
7 changed files with 104 additions and 46 deletions

View File

@@ -7,6 +7,7 @@ Demo for using and defining callback functions
 import argparse
 import os
 import tempfile
+from typing import Dict

 import numpy as np
 from matplotlib import pyplot as plt
@@ -17,24 +18,26 @@ import xgboost as xgb
 class Plotting(xgb.callback.TrainingCallback):
-    """Plot evaluation result during training. Only for demonstration purpose as it's quite
-    slow to draw.
+    """Plot evaluation result during training. Only for demonstration purpose as it's
+    quite slow to draw using matplotlib.
     """

-    def __init__(self, rounds):
+    def __init__(self, rounds: int) -> None:
         self.fig = plt.figure()
         self.ax = self.fig.add_subplot(111)
         self.rounds = rounds
-        self.lines = {}
+        self.lines: Dict[str, plt.Line2D] = {}
         self.fig.show()
         self.x = np.linspace(0, self.rounds, self.rounds)
         plt.ion()

-    def _get_key(self, data, metric):
+    def _get_key(self, data: str, metric: str) -> str:
         return f"{data}-{metric}"

-    def after_iteration(self, model, epoch, evals_log):
+    def after_iteration(
+        self, model: xgb.Booster, epoch: int, evals_log: Dict[str, dict]
+    ) -> bool:
         """Update the plot."""
         if not self.lines:
             for data, metric in evals_log.items():
@@ -55,7 +58,7 @@ class Plotting(xgb.callback.TrainingCallback):
         return False


-def custom_callback():
+def custom_callback() -> None:
     """Demo for defining a custom callback function that plots evaluation result during
     training."""
     X, y = load_breast_cancer(return_X_y=True)
@@ -82,19 +85,27 @@ def custom_callback():
     )


-def check_point_callback():
-    # only for demo, set a larger value (like 100) in practice as checkpointing is quite
+def check_point_callback() -> None:
+    """Demo for using the checkpoint callback. Custom logic for handling output is
+    usually required and users are encouraged to define their own callback for
+    checkpointing operations. The builtin one can be used as a starting point.
+
+    """
+    # Only for demo, set a larger value (like 100) in practice as checkpointing is quite
     # slow.
     rounds = 2

-    def check(as_pickle):
+    def check(as_pickle: bool) -> None:
         for i in range(0, 10, rounds):
             if i == 0:
                 continue
             if as_pickle:
                 path = os.path.join(tmpdir, "model_" + str(i) + ".pkl")
             else:
-                path = os.path.join(tmpdir, "model_" + str(i) + ".json")
+                path = os.path.join(
+                    tmpdir,
+                    f"model_{i}.{xgb.callback.TrainingCheckPoint.default_format}",
+                )
             assert os.path.exists(path)

     X, y = load_breast_cancer(return_X_y=True)
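For orientation, a minimal sketch of what this demo now exercises: the built-in checkpoint callback writes `name_N.ubj` files by default. The parameters and paths below are illustrative, not part of the commit:

```python
import os
import tempfile

import xgboost as xgb
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
dtrain = xgb.DMatrix(X, y)

with tempfile.TemporaryDirectory() as tmpdir:
    # Checkpoint every 2 rounds; the file extension follows the new
    # TrainingCheckPoint.default_format class attribute ("ubj").
    check_point = xgb.callback.TrainingCheckPoint(
        directory=tmpdir, interval=2, name="model"
    )
    xgb.train(
        {"objective": "binary:logistic"},
        dtrain,
        num_boost_round=10,
        callbacks=[check_point],
    )
    print(sorted(os.listdir(tmpdir)))  # e.g. ['model_2.ubj', 'model_4.ubj', ...]
```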

View File

@@ -88,22 +88,18 @@ Callback API
 .. autoclass:: xgboost.callback.EvaluationMonitor
     :members:
-    :inherited-members:
     :show-inheritance:

 .. autoclass:: xgboost.callback.EarlyStopping
     :members:
-    :inherited-members:
     :show-inheritance:

 .. autoclass:: xgboost.callback.LearningRateScheduler
     :members:
-    :inherited-members:
     :show-inheritance:

 .. autoclass:: xgboost.callback.TrainingCheckPoint
     :members:
-    :inherited-members:
     :show-inheritance:

 .. _dask_api:

View File

@@ -62,11 +62,31 @@ class TrainingCallback(ABC):
         return model

     def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
-        """Run before each iteration. Return True when training should stop."""
+        """Run before each iteration. Returns True when training should stop. See
+        :py:meth:`after_iteration` for details.
+
+        """
         return False

     def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
-        """Run after each iteration. Return True when training should stop."""
+        """Run after each iteration. Returns `True` when training should stop.
+
+        Parameters
+        ----------
+        model :
+            Either a :py:class:`~xgboost.Booster` object or a CVPack if the cv function
+            in xgboost is being used.
+        epoch :
+            The current training iteration.
+        evals_log :
+            A dictionary containing the evaluation history:
+
+            .. code-block:: python
+
+                {"data_name": {"metric_name": [0.5, ...]}}
+
+        """
         return False
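The documented `evals_log` layout makes small custom callbacks easy to write. A hedged sketch (the metric name, data name, and threshold are illustrative assumptions, not part of this commit):

```python
import xgboost as xgb


class ThresholdStop(xgb.callback.TrainingCallback):
    """Stop training once the latest validation logloss drops below a threshold."""

    def __init__(self, threshold: float) -> None:
        self.threshold = threshold
        super().__init__()

    def after_iteration(self, model, epoch, evals_log) -> bool:
        # evals_log has the shape {"data_name": {"metric_name": [0.5, ...]}}
        # described above; the data names come from the `evals` argument of train().
        history = evals_log.get("validation", {}).get("logloss", [])
        # Returning True stops training, False continues.
        return bool(history) and history[-1] < self.threshold
```

Passed as `callbacks=[ThresholdStop(0.1)]` to `xgb.train` with `evals=[(dvalid, "validation")]`, this stops as soon as the metric crosses the threshold.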
@@ -547,14 +567,16 @@ class TrainingCheckPoint(TrainingCallback):
     .. versionadded:: 1.3.0

+    Since XGBoost 2.1.0, the default format is changed to UBJSON.
+
     Parameters
     ----------

     directory :
         Output model directory.
     name :
-        pattern of output model file. Models will be saved as name_0.json, name_1.json,
-        name_2.json ....
+        pattern of output model file. Models will be saved as name_0.ubj, name_1.ubj,
+        name_2.ubj ....
     as_pickle :
         When set to True, all training parameters will be saved in pickle format,
         instead of saving only the model.
@@ -564,6 +586,8 @@ class TrainingCheckPoint(TrainingCallback):
     """

+    default_format = "ubj"
+
     def __init__(
         self,
         directory: Union[str, os.PathLike],
@@ -592,7 +616,7 @@ class TrainingCheckPoint(TrainingCallback):
                 self._name
                 + "_"
                 + (str(epoch + self._start))
-                + (".pkl" if self._as_pickle else ".json"),
+                + (".pkl" if self._as_pickle else f".{self.default_format}"),
             )
             self._epoch = 0  # reset counter
             if collective.get_rank() == 0:
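A checkpoint written this way is an ordinary model file, so it can be loaded back or used to resume boosting. A sketch, assuming `tmpdir`, `params`, and `dtrain` already exist and `model_8` was the last checkpoint:

```python
import os

import xgboost as xgb

path = os.path.join(
    tmpdir, f"model_8.{xgb.callback.TrainingCheckPoint.default_format}"
)

# Load the checkpoint as a regular Booster...
booster = xgb.Booster(model_file=path)

# ...or hand it to train() to continue boosting from that state.
booster = xgb.train(params, dtrain, num_boost_round=5, xgb_model=path)
```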

View File

@@ -2591,9 +2591,8 @@ class Booster:
         The model is saved in an XGBoost internal format which is universal among the
         various XGBoost interfaces. Auxiliary attributes of the Python Booster object
-        (such as feature_names) will not be saved when using binary format. To save
-        those attributes, use JSON/UBJ instead. See :doc:`Model IO
-        </tutorials/saving_model>` for more info.
+        (such as feature_names) are only saved when using JSON or UBJSON (default)
+        format. See :doc:`Model IO </tutorials/saving_model>` for more info.

         .. code-block:: python
@@ -2616,12 +2615,15 @@ class Booster:
     def save_raw(self, raw_format: str = "ubj") -> bytearray:
         """Save the model to an in-memory buffer representation instead of file.

+        The model is saved in an XGBoost internal format which is universal among the
+        various XGBoost interfaces. Auxiliary attributes of the Python Booster object
+        (such as feature_names) are only saved when using JSON or UBJSON (default)
+        format. See :doc:`Model IO </tutorials/saving_model>` for more info.
+
         Parameters
         ----------
         raw_format :
-            Format of output buffer. Can be `json`, `ubj` or `deprecated`. Right now
-            the default is `deprecated` but it will be changed to `ubj` (univeral binary
-            json) in the future.
+            Format of output buffer. Can be `json`, `ubj` or `deprecated`.

         Returns
         -------
@@ -2640,11 +2642,10 @@ class Booster:
     def load_model(self, fname: ModelIn) -> None:
         """Load the model from a file or a bytearray.

-        The model is loaded from XGBoost format which is universal among the various
-        XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
-        feature_names) will not be loaded when using binary format. To save those
-        attributes, use JSON/UBJ instead. See :doc:`Model IO </tutorials/saving_model>`
-        for more info.
+        The model is saved in an XGBoost internal format which is universal among the
+        various XGBoost interfaces. Auxiliary attributes of the Python Booster object
+        (such as feature_names) are only saved when using JSON or UBJSON (default)
+        format. See :doc:`Model IO </tutorials/saving_model>` for more info.

         .. code-block:: python
@@ -2769,9 +2770,9 @@ class Booster:
         with_stats: bool = False,
         dump_format: str = "text",
     ) -> List[str]:
-        """Returns the model dump as a list of strings. Unlike :py:meth:`save_model`, the output
-        format is primarily used for visualization or interpretation, hence it's more
-        human readable but cannot be loaded back to XGBoost.
+        """Returns the model dump as a list of strings. Unlike :py:meth:`save_model`,
+        the output format is primarily used for visualization or interpretation, hence
+        it's more human readable but cannot be loaded back to XGBoost.

         Parameters
         ----------
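The same UBJSON default applies to the in-memory path. A small round-trip sketch (the training setup is illustrative):

```python
import xgboost as xgb
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
booster = xgb.train(
    {"objective": "binary:logistic"}, xgb.DMatrix(X, y), num_boost_round=2
)

raw = booster.save_raw()  # raw_format defaults to "ubj"
assert isinstance(raw, bytearray)

# load_model accepts the bytearray directly; no temporary file is needed.
restored = xgb.Booster()
restored.load_model(raw)
```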

View File

@@ -31,6 +31,8 @@ class LintersPaths:
         "tests/python/test_with_pandas.py",
         "tests/python-gpu/",
         "tests/python-sycl/",
+        "tests/test_distributed/test_with_dask/",
+        "tests/test_distributed/test_gpu_with_dask/",
         "tests/test_distributed/test_with_spark/",
         "tests/test_distributed/test_gpu_with_spark/",
         # demo
@@ -91,6 +93,7 @@ class LintersPaths:
         # demo
         "demo/json-model/json_parser.py",
         "demo/guide-python/external_memory.py",
+        "demo/guide-python/callbacks.py",
         "demo/guide-python/cat_in_the_dat.py",
         "demo/guide-python/categorical.py",
         "demo/guide-python/cat_pipeline.py",

View File

@@ -244,7 +244,7 @@ class TestCallbacks:
         assert booster.num_boosted_rounds() == booster.best_iteration + 1

         with tempfile.TemporaryDirectory() as tmpdir:
-            path = os.path.join(tmpdir, 'model.json')
+            path = os.path.join(tmpdir, "model.json")
             cls.save_model(path)
             cls = xgb.XGBClassifier()
             cls.load_model(path)
@@ -378,7 +378,7 @@ class TestCallbacks:
         scheduler = xgb.callback.LearningRateScheduler

         dtrain, dtest = tm.load_agaricus(__file__)
-        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+        watchlist = [(dtest, "eval"), (dtrain, "train")]

         param = {
             "max_depth": 2,
@@ -429,7 +429,7 @@ class TestCallbacks:
         assert tree_3th_0["split_conditions"] != tree_3th_1["split_conditions"]

     @pytest.mark.parametrize("tree_method", ["hist", "approx", "approx"])
-    def test_eta_decay(self, tree_method):
+    def test_eta_decay(self, tree_method: str) -> None:
         self.run_eta_decay(tree_method)

     @pytest.mark.parametrize(
@@ -446,7 +446,7 @@ class TestCallbacks:
     def test_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None:
         self.run_eta_decay_leaf_output(tree_method, objective)

-    def test_check_point(self):
+    def test_check_point(self) -> None:
         from sklearn.datasets import load_breast_cancer

         X, y = load_breast_cancer(return_X_y=True)
@@ -463,7 +463,12 @@ class TestCallbacks:
                 callbacks=[check_point],
             )
             for i in range(1, 10):
-                assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json"))
+                assert os.path.exists(
+                    os.path.join(
+                        tmpdir,
+                        f"model_{i}.{xgb.callback.TrainingCheckPoint.default_format}",
+                    )
+                )

             check_point = xgb.callback.TrainingCheckPoint(
                 directory=tmpdir, interval=1, as_pickle=True, name="model"
@@ -478,7 +483,7 @@ class TestCallbacks:
             for i in range(1, 10):
                 assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl"))

-    def test_callback_list(self):
+    def test_callback_list(self) -> None:
         X, y = tm.data.get_california_housing()
         m = xgb.DMatrix(X, y)
         callbacks = [xgb.callback.EarlyStopping(rounds=10)]
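For the `as_pickle=True` branch exercised above, the checkpoint is a pickled object rather than a model file. A hedged sketch of reading one back, assuming the pickle holds the Booster itself and `tmpdir` contains `model_1.pkl`:

```python
import os
import pickle

import xgboost as xgb

with open(os.path.join(tmpdir, "model_1.pkl"), "rb") as fd:
    booster = pickle.load(fd)

assert isinstance(booster, xgb.Booster)
```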

View File

@@ -1590,7 +1590,7 @@ class TestWithDask:
     @given(
         params=hist_parameter_strategy,
         cache_param=hist_cache_strategy,
-        dataset=tm.make_dataset_strategy()
+        dataset=tm.make_dataset_strategy(),
     )
     @settings(
         deadline=None, max_examples=10, suppress_health_check=suppress, print_blob=True
@@ -2250,16 +2250,27 @@ class TestDaskCallbacks:
             ],
         )
         for i in range(1, 10):
-            assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json"))
+            assert os.path.exists(
+                os.path.join(
+                    tmpdir,
+                    f"model_{i}.{xgb.callback.TrainingCheckPoint.default_format}",
+                )
+            )


-@gen_cluster(client=True, clean_kwargs={"processes": False, "threads": False}, allow_unclosed=True)
+@gen_cluster(
+    client=True,
+    clean_kwargs={"processes": False, "threads": False},
+    allow_unclosed=True,
+)
 async def test_worker_left(c, s, a, b):
     async with Worker(s.address):
         dx = da.random.random((1000, 10)).rechunk(chunks=(10, None))
         dy = da.random.random((1000,)).rechunk(chunks=(10,))
         d_train = await xgb.dask.DaskDMatrix(
-            c, dx, dy,
+            c,
+            dx,
+            dy,
         )
         await async_poll_for(lambda: len(s.workers) == 2, timeout=5)
         with pytest.raises(RuntimeError, match="Missing"):
@@ -2271,12 +2282,19 @@ async def test_worker_left(c, s, a, b):
     )


-@gen_cluster(client=True, Worker=Nanny, clean_kwargs={"processes": False, "threads": False}, allow_unclosed=True)
+@gen_cluster(
+    client=True,
+    Worker=Nanny,
+    clean_kwargs={"processes": False, "threads": False},
+    allow_unclosed=True,
+)
 async def test_worker_restarted(c, s, a, b):
     dx = da.random.random((1000, 10)).rechunk(chunks=(10, None))
     dy = da.random.random((1000,)).rechunk(chunks=(10,))
     d_train = await xgb.dask.DaskDMatrix(
-        c, dx, dy,
+        c,
+        dx,
+        dy,
     )
     await c.restart_workers([a.worker_address])
     with pytest.raises(RuntimeError, match="Missing"):
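The Dask test above checks the same naming scheme in distributed training. A minimal sketch of checkpointing through the Dask interface (cluster setup elided; names and parameters illustrative). Note that per the callback code, the file is written by the rank-0 worker, so `tmpdir` must be reachable from that worker:

```python
import dask.array as da
from distributed import Client

import xgboost as xgb


def train_with_checkpoints(client: Client, tmpdir: str) -> None:
    dx = da.random.random((1000, 10), chunks=(100, 10))
    dy = da.random.random((1000,), chunks=(100,))
    dtrain = xgb.dask.DaskDMatrix(client, dx, dy)
    # Checkpoints land in tmpdir as model_1.ubj, model_2.ubj, ...
    xgb.dask.train(
        client,
        {"objective": "reg:squarederror"},
        dtrain,
        num_boost_round=4,
        callbacks=[
            xgb.callback.TrainingCheckPoint(directory=tmpdir, interval=1, name="model")
        ],
    )
```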