Save model in ubj as the default. (#9947)
This commit is contained in:
406
tests/python/test_model_io.py
Normal file
406
tests/python/test_model_io.py
Normal file
@@ -0,0 +1,406 @@
|
||||
import json
|
||||
import locale
|
||||
import os
|
||||
import pickle
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
|
||||
|
||||
def json_model(model_path: str, parameters: dict) -> dict:
    """Train a small classifier, save it to ``model_path``, and return the
    saved model decoded into a Python dict.

    The on-disk format is inferred from the path suffix: ``*ubj`` files are
    decoded with the ``ubjson`` package, anything else is parsed as JSON.
    Note: ``parameters`` is mutated (``num_class`` is injected) for the
    multi-class softmax objective.
    """
    datasets = pytest.importorskip("sklearn.datasets")

    X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)
    # multi:softmax requires num_class to be set explicitly.
    if parameters.get("objective", None) == "multi:softmax":
        parameters["num_class"] = 3

    booster = xgb.train(parameters, xgb.DMatrix(X, y))
    booster.save_model(model_path)

    if not model_path.endswith("ubj"):
        with open(model_path, "r") as fd:
            return json.load(fd)

    import ubjson

    with open(model_path, "rb") as ubjfd:
        return ubjson.load(ubjfd)
|
||||
|
||||
|
||||
class TestBoosterIO:
    """Round-trip tests for Booster model serialization (JSON / UBJSON /
    deprecated binary), covering files, raw buffers, and pathlib paths."""

    def run_model_json_io(self, parameters: dict, ext: str) -> None:
        """Save/load a model with extension ``ext`` and verify round-trip
        equivalence across file, JSON-raw, and UBJSON-raw representations."""
        config = xgb.config.get_config()
        assert config["verbosity"] == 1

        # Skip UBJSON runs when the ubjson package is unavailable.
        if ext == "ubj" and tm.no_ubjson()["condition"]:
            pytest.skip(tm.no_ubjson()["reason"])

        # Model IO must not change the process's preferred encoding;
        # captured here and re-checked after save/load below.
        loc = locale.getpreferredencoding(False)
        model_path = "test_model_json_io." + ext
        j_model = json_model(model_path, parameters)
        assert isinstance(j_model["learner"], dict)

        bst = xgb.Booster(model_file=model_path)

        # Re-save from the loaded booster and decode again to verify the
        # document survives a second round trip.
        bst.save_model(fname=model_path)
        if ext == "ubj":
            import ubjson

            with open(model_path, "rb") as ubjfd:
                j_model = ubjson.load(ubjfd)
        else:
            with open(model_path, "r") as fd:
                j_model = json.load(fd)

        assert isinstance(j_model["learner"], dict)

        os.remove(model_path)
        assert locale.getpreferredencoding(False) == loc

        # Load from an in-memory JSON raw buffer.
        json_raw = bst.save_raw(raw_format="json")
        from_jraw = xgb.Booster()
        from_jraw.load_model(json_raw)

        # Load from an in-memory UBJSON raw buffer.
        ubj_raw = bst.save_raw(raw_format="ubj")
        from_ubjraw = xgb.Booster()
        from_ubjraw.load_model(ubj_raw)

        if parameters.get("multi_strategy", None) != "multi_output_tree":
            # Old binary model is not supported for vector leaf.
            # Saving in the deprecated format must emit the UBJSON-default
            # warning, and both sources must produce identical bytes.
            with pytest.warns(Warning, match="Model format is default to UBJSON"):
                old_from_json = from_jraw.save_raw(raw_format="deprecated")
                old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

            assert old_from_json == old_from_ubj

        # Loading must tolerate pretty-printed JSON with trailing whitespace,
        # supplied as a bytearray.
        raw_json = bst.save_raw(raw_format="json")
        pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n"
        bst.load_model(bytearray(pretty, encoding="ascii"))

        if parameters.get("multi_strategy", None) != "multi_output_tree":
            # old binary model is not supported.
            # Re-checks the deprecated-format equivalence after the
            # pretty-printed reload above.
            with pytest.warns(Warning, match="Model format is default to UBJSON"):
                old_from_json = from_jraw.save_raw(raw_format="deprecated")
                old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")

            assert old_from_json == old_from_ubj

        # Predictions from a raw-buffer-loaded booster must match the
        # original booster's predictions on random data.
        rng = np.random.default_rng()
        X = rng.random(size=from_jraw.num_features() * 10).reshape(
            (10, from_jraw.num_features())
        )
        predt_from_jraw = from_jraw.predict(xgb.DMatrix(X))
        predt_from_bst = bst.predict(xgb.DMatrix(X))
        np.testing.assert_allclose(predt_from_jraw, predt_from_bst)

    @pytest.mark.parametrize("ext", ["json", "ubj"])
    def test_model_json_io(self, ext: str) -> None:
        """Exercise run_model_json_io over all booster types."""
        parameters = {"booster": "gbtree", "tree_method": "hist"}
        self.run_model_json_io(parameters, ext)
        # Vector-leaf (multi-output-tree) models take a separate code path.
        parameters = {
            "booster": "gbtree",
            "tree_method": "hist",
            "multi_strategy": "multi_output_tree",
            "objective": "multi:softmax",
        }
        self.run_model_json_io(parameters, ext)
        parameters = {"booster": "gblinear"}
        self.run_model_json_io(parameters, ext)
        parameters = {"booster": "dart", "tree_method": "hist"}
        self.run_model_json_io(parameters, ext)

    def test_categorical_model_io(self) -> None:
        """Categorical models round-trip via JSON and UBJSON, but saving to
        the deprecated binary format must raise."""
        X, y = tm.make_categorical(256, 16, 71, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "approx"}, Xy, num_boost_round=16)
        predt_0 = booster.predict(Xy)

        with tempfile.TemporaryDirectory() as tempdir:
            # Deprecated binary format cannot represent categorical splits:
            # expect the format warning followed by a ValueError.
            path = os.path.join(tempdir, "model.deprecated")
            with pytest.raises(ValueError, match=r".*JSON/UBJSON.*"):
                with pytest.warns(Warning, match="Model format is default to UBJSON"):
                    booster.save_model(path)

            # JSON round trip preserves predictions.
            path = os.path.join(tempdir, "model.json")
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)

            # UBJSON round trip preserves predictions.
            path = os.path.join(tempdir, "model.ubj")
            booster.save_model(path)
            booster = xgb.Booster(model_file=path)
            predt_1 = booster.predict(Xy)
            np.testing.assert_allclose(predt_0, predt_1)

    @pytest.mark.skipif(**tm.no_json_schema())
    def test_json_io_schema(self) -> None:
        """Validate saved JSON models against doc/model.schema, and check the
        schema's objective list matches the objectives the core reports."""
        import jsonschema

        model_path = "test_json_schema.json"
        # Repository root is two directories above this test file.
        path = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        )
        doc = os.path.join(path, "doc", "model.schema")
        with open(doc, "r") as fd:
            schema = json.load(fd)
        parameters = {"tree_method": "hist", "booster": "gbtree"}
        jsonschema.validate(instance=json_model(model_path, parameters), schema=schema)
        os.remove(model_path)

        parameters = {"tree_method": "hist", "booster": "dart"}
        jsonschema.validate(instance=json_model(model_path, parameters), schema=schema)
        os.remove(model_path)

        try:
            # An invalid objective makes training raise with a message that
            # lists every objective candidate; parse that list out.
            dtrain, _ = tm.load_agaricus(__file__)
            xgb.train({"objective": "foo"}, dtrain, num_boost_round=1)
        except ValueError as e:
            e_str = str(e)
            beg = e_str.find("Objective candidate")
            end = e_str.find("Stack trace")
            e_str = e_str[beg:end]
            e_str = e_str.strip()
            splited = e_str.splitlines()
            objectives = [s.split(": ")[1] for s in splited]
            j_objectives = schema["properties"]["learner"]["properties"]["objective"][
                "oneOf"
            ]
            objectives_from_schema = set()
            for j_obj in j_objectives:
                objectives_from_schema.add(j_obj["properties"]["name"]["const"])
            # Every objective known to the core must be in the schema and
            # vice versa.
            assert set(objectives) == objectives_from_schema

    def test_model_binary_io(self) -> None:
        """Deprecated binary save warns but still round-trips, preserving
        string-typed parameters and the raw buffer."""
        model_path = "test_model_binary_io.deprecated"
        parameters = {
            "tree_method": "hist",
            "booster": "gbtree",
            "scale_pos_weight": "0.5",
        }
        X = np.random.random((10, 3))
        y = np.random.random((10,))
        dtrain = xgb.DMatrix(X, y)
        bst = xgb.train(parameters, dtrain, num_boost_round=2)
        # Saving with a .deprecated suffix triggers the format warning.
        with pytest.warns(Warning, match="Model format is default to UBJSON"):
            bst.save_model(model_path)
        bst = xgb.Booster(model_file=model_path)
        os.remove(model_path)
        # scale_pos_weight passed as a string must survive the round trip.
        config = json.loads(bst.save_config())
        assert (
            float(config["learner"]["objective"]["reg_loss_param"]["scale_pos_weight"])
            == 0.5
        )

        # save_raw with no arguments must be stable under a load/save cycle.
        buf = bst.save_raw()
        from_raw = xgb.Booster()
        from_raw.load_model(buf)

        buf_from_raw = from_raw.save_raw()
        assert buf == buf_from_raw

    def test_with_pathlib(self) -> None:
        """Saving and loading model files from paths."""
        save_path = Path("model.ubj")

        rng = np.random.default_rng(1994)

        data = rng.normal(size=(100, 2))
        target = np.array([0, 1] * 50)
        features = ["Feature1", "Feature2"]

        dm = xgb.DMatrix(data, label=target, feature_names=features)
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "eta": 0.3,
            "max_depth": 1,
        }

        bst = xgb.train(params, dm, num_boost_round=1)

        # save, assert exists
        bst.save_model(save_path)
        assert save_path.exists()

        def dump_assertions(dump: List[str]) -> None:
            """Assertions for the expected dump from Booster"""
            assert len(dump) == 1, "Exepcted only 1 tree to be dumped."
            assert (
                len(dump[0].splitlines()) == 3
            ), "Expected 1 root and 2 leaves - 3 lines."

        # load the model again using Path
        bst2 = xgb.Booster(model_file=save_path)
        dump2 = bst2.get_dump()
        dump_assertions(dump2)

        # load again using load_model
        bst3 = xgb.Booster()
        bst3.load_model(save_path)
        dump3 = bst3.get_dump()
        dump_assertions(dump3)

        # remove file
        Path.unlink(save_path)
|
||||
|
||||
|
||||
def save_load_model(model_path: str) -> None:
    """Round-trip an ``XGBClassifier`` through ``model_path`` and verify the
    reloaded estimator: sklearn attributes, prediction accuracy, agreement
    with a natively loaded Booster, early-stopping state, and pickling.

    The on-disk format is determined by the suffix of ``model_path``; callers
    wrap this in ``pytest.warns`` when the suffix selects the deprecated
    binary format.
    """
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    rng = np.random.RandomState(1994)

    digits = load_digits(n_class=2)
    y = digits["target"]
    X = digits["data"]
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        xgb_model.save_model(model_path)

        # Reload into a fresh estimator; sklearn metadata must be restored.
        xgb_model = xgb.XGBClassifier()
        xgb_model.load_model(model_path)

        assert isinstance(xgb_model.classes_, np.ndarray)
        np.testing.assert_equal(xgb_model.classes_, np.array([0, 1]))
        assert isinstance(xgb_model._Booster, xgb.Booster)

        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        err = sum(
            1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]
        ) / float(len(preds))
        assert err < 0.1
        # The sklearn-metadata attribute is cleared after serialization.
        assert xgb_model.get_booster().attr("scikit_learn") is None

        # test native booster
        preds = xgb_model.predict(X[test_index], output_margin=True)
        booster = xgb.Booster(model_file=model_path)
        predt_1 = booster.predict(xgb.DMatrix(X[test_index]), output_margin=True)
        assert np.allclose(preds, predt_1)

        # Loading a classifier model into the base XGBModel must fail.
        with pytest.raises(TypeError):
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(model_path)

    # Early-stopping state (best_iteration/best_score) must survive both a
    # save/load round trip and pickling.
    clf = xgb.XGBClassifier(booster="gblinear", early_stopping_rounds=1)
    clf.fit(X, y, eval_set=[(X, y)])
    best_iteration = clf.best_iteration
    best_score = clf.best_score
    predt_0 = clf.predict(X)
    clf.save_model(model_path)
    clf.load_model(model_path)
    assert clf.booster == "gblinear"
    predt_1 = clf.predict(X)
    np.testing.assert_allclose(predt_0, predt_1)
    assert clf.best_iteration == best_iteration
    assert clf.best_score == best_score

    clfpkl = pickle.dumps(clf)
    clf = pickle.loads(clfpkl)
    predt_2 = clf.predict(X)
    np.testing.assert_allclose(predt_0, predt_2)
    assert clf.best_iteration == best_iteration
    assert clf.best_score == best_score
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
def test_sklearn_model() -> None:
    """Run save_load_model for each supported format and check that native
    boosters interoperate with the sklearn wrappers."""
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split

    # Deprecated binary format: still works, but must warn.
    with tempfile.TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.deprecated")
        with pytest.warns(Warning, match="Model format is default to UBJSON"):
            save_load_model(model_path)

    # JSON format: no warning expected.
    with tempfile.TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model.json")
        save_load_model(model_path)

    with tempfile.TemporaryDirectory() as tempdir:
        model_path = os.path.join(tempdir, "digits.model.ubj")
        digits = load_digits(n_class=2)
        y = digits["target"]
        X = digits["data"]
        # Train with the native API, then load into sklearn wrappers.
        booster = xgb.train(
            {"tree_method": "hist", "objective": "binary:logistic"},
            dtrain=xgb.DMatrix(X, y),
            num_boost_round=4,
        )
        predt_0 = booster.predict(xgb.DMatrix(X))
        booster.save_model(model_path)
        cls = xgb.XGBClassifier()
        cls.load_model(model_path)

        proba = cls.predict_proba(X)
        assert proba.shape[0] == X.shape[0]
        assert proba.shape[1] == 2  # binary

        # Positive-class probability must equal the native prediction.
        predt_1 = cls.predict_proba(X)[:, 1]
        assert np.allclose(predt_0, predt_1)

        # The generic XGBModel returns raw predictions matching the native
        # booster for this objective.
        cls = xgb.XGBModel()
        cls.load_model(model_path)
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)

        # mclass
        X, y = load_digits(n_class=10, return_X_y=True)
        # small test_size to force early stop
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.01, random_state=1
        )
        clf = xgb.XGBClassifier(
            n_estimators=64, tree_method="hist", early_stopping_rounds=2
        )
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        score = clf.best_score
        clf.save_model(model_path)

        clf = xgb.XGBClassifier()
        clf.load_model(model_path)
        # Multi-class metadata must be reconstructed from the saved model.
        assert clf.classes_.size == 10
        assert clf.objective == "multi:softprob"

        np.testing.assert_equal(clf.classes_, np.arange(10))
        assert clf.n_classes_ == 10

        # NOTE(review): 27 is the observed early-stop iteration for this
        # fixed random_state; brittle if data or defaults change.
        assert clf.best_iteration == 27
        assert clf.best_score == score
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
def test_with_sklearn_obj_metric() -> None:
    """Custom (callable) objective and metric survive pickling but are not
    carried through a saved model file."""
    from sklearn.metrics import mean_squared_error

    X, y = tm.datasets.make_regression()
    regressor = xgb.XGBRegressor(objective=tm.ls_obj, eval_metric=mean_squared_error)
    regressor.fit(X, y)

    # Pickle preserves the Python callables.
    restored = pickle.loads(pickle.dumps(regressor))
    assert callable(restored.objective)
    assert callable(restored.eval_metric)

    with tempfile.TemporaryDirectory() as tmpdir:
        model_file = os.path.join(tmpdir, "model.json")
        regressor.save_model(model_file)

        reloaded = xgb.XGBRegressor()
        reloaded.load_model(model_file)

        # A saved model file cannot carry Python callables: neither attribute
        # is callable after reload, and eval_metric comes back as None.
        assert not callable(reloaded.objective)
        assert not callable(reloaded.eval_metric)
        assert reloaded.eval_metric is None
|
||||
Reference in New Issue
Block a user