Test loading models with invalid file extensions. (#9955)
This commit is contained in:
parent
3ff3a5f1ed
commit
9a30bdd313
@ -2,14 +2,20 @@
|
||||
Introduction to Model IO
|
||||
########################
|
||||
|
||||
Since 2.1.0, the default model format for XGBoost is the UBJSON format; the option is
|
||||
enabled for serializing models to file, serializing models to buffer, and for memory
|
||||
snapshot (pickle and alike).
|
||||
|
||||
In XGBoost 1.0.0, we introduced support of using `JSON
|
||||
<https://www.json.org/json-en.html>`_ for saving/loading XGBoost models and related
|
||||
hyper-parameters for training, aiming to replace the old binary internal format with an
|
||||
open format that can be easily reused. Later in XGBoost 1.6.0, additional support for
|
||||
`Universal Binary JSON <https://ubjson.org/>`__ is added as an optimization for more
|
||||
efficient model IO. They have the same document structure with different representations,
|
||||
and we will refer to them collectively as the JSON format. This tutorial aims to share some
|
||||
basic insights into the JSON serialisation method used in XGBoost. Without explicitly
|
||||
efficient model IO, which is set to default in 2.1.
|
||||
|
||||
JSON and UBJSON have the same document structure with different representations, and we
|
||||
will refer to them collectively as the JSON format. This tutorial aims to share some basic
|
||||
insights into the JSON serialisation method used in XGBoost. Unless explicitly
|
||||
mentioned, the following sections assume you are using one of the 2 output formats,
|
||||
which can be enabled by providing the file name with ``.json`` (or ``.ubj`` for binary
|
||||
JSON) as file extension when saving/loading model: ``booster.save_model('model.json')``.
|
||||
@ -25,12 +31,13 @@ If you come from Deep Learning community, then it should be
|
||||
clear to you that there are differences between the neural network structures composed of
|
||||
weights with fixed tensor operations, and the optimizers (like RMSprop) used to train them.
|
||||
|
||||
So when one calls ``booster.save_model`` (``xgb.save`` in R), XGBoost saves the trees, some model
|
||||
parameters like number of input columns in trained trees, and the objective function, which combine
|
||||
to represent the concept of "model" in XGBoost. As for why we are saving the objective as
|
||||
part of model, that's because objective controls transformation of global bias (called
|
||||
``base_score`` in XGBoost). Users can share this model with others for prediction,
|
||||
evaluation or continue the training with a different set of hyper-parameters etc.
|
||||
So when one calls ``booster.save_model`` (``xgb.save`` in R), XGBoost saves the trees,
|
||||
some model parameters like number of input columns in trained trees, and the objective
|
||||
function, which combine to represent the concept of "model" in XGBoost. As for why are
|
||||
we saving the objective as part of model, that's because objective controls transformation
|
||||
of global bias (called ``base_score`` in XGBoost) and task-specific information. Users
|
||||
can share this model with others for prediction, evaluation or continue the training with
|
||||
a different set of hyper-parameters etc.
|
||||
|
||||
However, this is not the end of the story. There are cases where we need to save something
|
||||
more than just the model itself. For example, in distributed training, XGBoost performs
|
||||
@ -81,7 +88,10 @@ a filename with ``.json`` or ``.ubj`` as file extension, the latter is the exten
|
||||
JSON files that were produced by an external source may lead to undefined behaviors
|
||||
and crashes.
|
||||
|
||||
For memory snapshots, UBJSON is the default starting with XGBoost 1.6.
|
||||
For memory snapshots, UBJSON is the default starting with XGBoost 1.6. When loading
|
||||
the model back, XGBoost recognizes the file extensions ``.json`` and ``.ubj``, and can
|
||||
dispatch accordingly. If the extension is not specified, XGBoost tries to guess the right
|
||||
one.
|
||||
|
||||
***************************************************************
|
||||
A note on backward compatibility of models and memory snapshots
|
||||
|
||||
@ -254,6 +254,68 @@ class TestBoosterIO:
|
||||
# remove file
|
||||
Path.unlink(save_path)
|
||||
|
||||
def test_invalid_postfix(self) -> None:
    """Test loading models whose file extension mis-states the format.

    No special handling is expected for a mis-specified model format; the
    JSON/UBJ parser is allowed to emit a parsing error.

    The test trains a small booster and then checks, in order:

    - a model saved under an unrecognized extension (``.deprecated``) fails
      to load once the file is renamed to ``.ubj`` or ``.json``;
    - a model saved as ``.ubj`` loads under the unrecognized extension but
      fails under ``.json``;
    - a model saved as ``.json`` loads under the unrecognized extension but
      fails under ``.ubj``;
    - a model saved without any extension loads back to an identical
      JSON dump.

    """
    X, y, w = tm.make_regression(64, 16, False)
    booster = xgb.train({}, xgb.QuantileDMatrix(X, y, weight=w), num_boost_round=3)

    with tempfile.TemporaryDirectory() as tmpdir:
        path_dep = os.path.join(tmpdir, "model.deprecated")
        path_ubj = os.path.join(tmpdir, "model.ubj")
        path_json = os.path.join(tmpdir, "model.json")

        # Files are moved with `os.replace` rather than `os.rename`: the
        # destination sometimes already exists, and `os.rename` cannot
        # overwrite an existing file on Windows.

        # save into deprecated format
        with pytest.warns(UserWarning, match="UBJSON"):
            booster.save_model(path_dep)

        os.replace(path_dep, path_ubj)
        # NOTE: `match` is applied as a regular expression search against
        # the error message.
        with pytest.raises(ValueError, match="{"):
            xgb.Booster(model_file=path_ubj)

        os.replace(path_ubj, path_json)
        with pytest.raises(ValueError, match="{"):
            xgb.Booster(model_file=path_json)

        # save into ubj format
        booster.save_model(path_ubj)
        os.replace(path_ubj, path_dep)
        # deprecated is not a recognized format internally, XGBoost can guess the
        # right format
        xgb.Booster(model_file=path_dep)
        # The same UBJSON payload under a `.json` name must be rejected.
        os.replace(path_dep, path_json)
        with pytest.raises(ValueError, match="Expecting"):
            xgb.Booster(model_file=path_json)

        # save into JSON format
        booster.save_model(path_json)
        os.replace(path_json, path_dep)
        # deprecated is not a recognized format internally, XGBoost can guess the
        # right format
        xgb.Booster(model_file=path_dep)
        # The same JSON payload under a `.ubj` name must be rejected.
        os.replace(path_dep, path_ubj)
        with pytest.raises(ValueError, match="Expecting"):
            xgb.Booster(model_file=path_ubj)

        # save model without file extension
        path_no = os.path.join(tmpdir, "model")
        with pytest.warns(UserWarning, match="UBJSON"):
            booster.save_model(path_no)

        # The reloaded booster must serialize to exactly the same JSON
        # document as the original one.
        booster_1 = xgb.Booster(model_file=path_no)
        r0 = booster.save_raw(raw_format="json")
        r1 = booster_1.save_raw(raw_format="json")
        assert r0 == r1
|
||||
|
||||
|
||||
def save_load_model(model_path: str) -> None:
|
||||
from sklearn.datasets import load_digits
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user