Additional tests for attributes and model boosted rounds. (#9962)
commit 2f57bbde3c
parent bed0349954
@@ -535,8 +535,7 @@ class LearnerConfiguration : public Learner {
     tparam_.booster = get<String>(gradient_booster["name"]);
     if (!gbm_) {
-      gbm_.reset(GradientBooster::Create(tparam_.booster,
-                                         &ctx_, &learner_model_param_));
+      gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_, &learner_model_param_));
     }
     gbm_->LoadConfig(gradient_booster);
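Aside (not part of the diff): the booster name parsed by get<String>(gradient_booster["name"]) above is the same field visible from Python through Booster.save_config(). A minimal sketch, assuming the usual learner/gradient_booster layout of the saved config; the data is random and illustrative:

import json

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
Xy = xgb.DMatrix(rng.randn(32, 4), rng.randn(32))
booster = xgb.train({"booster": "gbtree"}, Xy, num_boost_round=1)

config = json.loads(booster.save_config())
# Assumed config path; this is the name consumed by GradientBooster::Create.
print(config["learner"]["gradient_booster"]["name"])  # "gbtree"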
@@ -1095,6 +1094,11 @@ class LearnerIO : public LearnerConfiguration {
     std::vector<std::pair<std::string, std::string> > extra_attr;
     mparam.contain_extra_attrs = 1;

+    if (!this->feature_names_.empty() || !this->feature_types_.empty()) {
+      LOG(WARNING) << "feature names and feature types are being disregarded, use JSON/UBJSON "
+                      "format instead.";
+    }
+
     {
       // Similar to JSON model IO, we save the objective.
       Json j_obj { Object() };
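Aside (not part of the diff): the warning above fires because the legacy binary format cannot store feature names or types, while JSON/UBJSON round-trips keep them. A minimal sketch with random, illustrative data:

import os
import tempfile

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
Xy = xgb.DMatrix(rng.randn(64, 3), rng.randn(64), feature_names=["a", "b", "c"])
booster = xgb.train({}, Xy, num_boost_round=1)

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "model.json")  # JSON keeps the feature info
    booster.save_model(path)
    assert xgb.Booster(model_file=path).feature_names == ["a", "b", "c"]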
@@ -1,5 +1,4 @@
 import json
 import locale
 import os
 import tempfile
@@ -110,20 +109,39 @@ class TestModels:
         predt_2 = bst.predict(dtrain)
         assert np.all(np.abs(predt_2 - predt_1) < 1e-6)

-    def test_boost_from_existing_model(self):
+    def test_boost_from_existing_model(self) -> None:
         X, _ = tm.load_agaricus(__file__)
-        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
+        booster = xgb.train({"tree_method": "hist"}, X, num_boost_round=4)
         assert booster.num_boosted_rounds() == 4
-        booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
-                            xgb_model=booster)
+        booster.set_param({"tree_method": "approx"})
+        assert booster.num_boosted_rounds() == 4
+        booster = xgb.train(
+            {"tree_method": "hist"}, X, num_boost_round=4, xgb_model=booster
+        )
         assert booster.num_boosted_rounds() == 8
-        booster = xgb.train({'updater': 'prune', 'process_type': 'update'}, X,
-                            num_boost_round=4, xgb_model=booster)
+        with pytest.warns(UserWarning, match="`updater`"):
+            booster = xgb.train(
+                {"updater": "prune", "process_type": "update"},
+                X,
+                num_boost_round=4,
+                xgb_model=booster,
+            )
         # Trees are moved for update, the rounds is reduced. This test is
         # written for being compatible with current code (1.0.0). If the
         # behaviour is considered sub-optimal, feel free to change.
         assert booster.num_boosted_rounds() == 4

+        booster = xgb.train({"booster": "gblinear"}, X, num_boost_round=4)
+        assert booster.num_boosted_rounds() == 4
+        booster.set_param({"updater": "coord_descent"})
+        assert booster.num_boosted_rounds() == 4
+        booster.set_param({"updater": "shotgun"})
+        assert booster.num_boosted_rounds() == 4
+        booster = xgb.train(
+            {"booster": "gblinear"}, X, num_boost_round=4, xgb_model=booster
+        )
+        assert booster.num_boosted_rounds() == 8
+
     def run_custom_objective(self, tree_method=None):
         param = {
             'max_depth': 2,
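Aside (not part of the diff): the assertion that the round count stays at 4 follows from process_type="update" consuming the existing trees rather than appending new ones. A minimal sketch, using the "refresh" updater as one updater valid under "update"; the data is random and illustrative:

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
Xy = xgb.DMatrix(rng.randn(128, 4), rng.randn(128))

base = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=4)
# Each "round" re-processes one of base's 4 existing trees, so nothing
# is appended and the model size does not grow.
refreshed = xgb.train(
    {"process_type": "update", "updater": "refresh"},
    Xy,
    num_boost_round=4,
    xgb_model=base,
)
assert refreshed.num_boosted_rounds() == 4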
@@ -307,25 +325,6 @@ class TestModels:
         for d in text_dump:
             assert d.find(r"feature \"2\"") != -1

-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_attributes(self):
-        from sklearn.datasets import load_iris
-
-        X, y = load_iris(return_X_y=True)
-        cls = xgb.XGBClassifier(n_estimators=2)
-        cls.fit(X, y, early_stopping_rounds=1, eval_set=[(X, y)])
-        assert cls.get_booster().best_iteration == cls.n_estimators - 1
-        assert cls.best_iteration == cls.get_booster().best_iteration
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            path = os.path.join(tmpdir, "cls.json")
-            cls.save_model(path)
-
-            cls = xgb.XGBClassifier(n_estimators=2)
-            cls.load_model(path)
-            assert cls.get_booster().best_iteration == cls.n_estimators - 1
-            assert cls.best_iteration == cls.get_booster().best_iteration
-
     def run_slice(
         self,
         booster: xgb.Booster,
@@ -493,18 +492,23 @@ class TestModels:
         np.testing.assert_allclose(predt0, predt1, atol=1e-5)

     @pytest.mark.skipif(**tm.no_pandas())
-    def test_feature_info(self):
+    @pytest.mark.parametrize("ext", ["json", "ubj"])
+    def test_feature_info(self, ext: str) -> None:
         import pandas as pd

-        # make data
         rows = 100
         cols = 10
         X = rng.randn(rows, cols)
         y = rng.randn(rows)

+        # Test with pandas, which has feature info.
         feature_names = ["test_feature_" + str(i) for i in range(cols)]
         X_pd = pd.DataFrame(X, columns=feature_names)
         X_pd[f"test_feature_{3}"] = X_pd.iloc[:, 3].astype(np.int32)

         Xy = xgb.DMatrix(X_pd, y)
         assert Xy.feature_types is not None
         assert Xy.feature_types[3] == "int"
+        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)
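Aside (not part of the diff): the "int" feature type asserted above comes from DMatrix mapping pandas dtypes to XGBoost feature types. A minimal sketch with illustrative column names:

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.RandomState(1994)
df = pd.DataFrame({"f0": rng.randn(10), "f1": np.arange(10, dtype=np.int32)})
Xy = xgb.DMatrix(df, rng.randn(10))
print(Xy.feature_names)  # ['f0', 'f1']
print(Xy.feature_types)  # ['float', 'int'], mapped from the pandas dtypes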
@@ -513,10 +517,32 @@ class TestModels:
         assert booster.feature_types == Xy.feature_types

         with tempfile.TemporaryDirectory() as tmpdir:
-            path = tmpdir + "model.json"
+            path = tmpdir + f"model.{ext}"
             booster.save_model(path)
             booster = xgb.Booster()
             booster.load_model(path)

             assert booster.feature_names == Xy.feature_names
             assert booster.feature_types == Xy.feature_types
+
+        # Test with numpy, no feature info is set
+        Xy = xgb.DMatrix(X, y)
+        assert Xy.feature_names is None
+        assert Xy.feature_types is None
+
+        booster = xgb.train({}, dtrain=Xy, num_boost_round=1)
+        assert booster.feature_names is None
+        assert booster.feature_types is None
+
+        # test explicitly set
+        fns = [str(i) for i in range(cols)]
+        booster.feature_names = fns
+
+        assert booster.feature_names == fns
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, f"model.{ext}")
+            booster.save_model(path)
+
+            booster = xgb.Booster(model_file=path)
+            assert booster.feature_names == fns
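Aside (not part of the diff): the new branch assigns names directly on the Booster and checks they survive a save/load in either format. A minimal sketch of the UBJSON case; names and data are illustrative:

import os
import tempfile

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
booster = xgb.train(
    {}, xgb.DMatrix(rng.randn(20, 3), rng.randn(20)), num_boost_round=1
)
booster.feature_names = ["f0", "f1", "f2"]

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "model.ubj")  # the "ubj" case of the parametrize
    booster.save_model(path)
    assert xgb.Booster(model_file=path).feature_names == ["f0", "f1", "f2"]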
@@ -466,3 +466,33 @@ def test_with_sklearn_obj_metric() -> None:
     assert not callable(reg_2.objective)
     assert not callable(reg_2.eval_metric)
     assert reg_2.eval_metric is None
+
+
+@pytest.mark.skipif(**tm.no_sklearn())
+def test_attributes() -> None:
+    from sklearn.datasets import load_iris
+
+    X, y = load_iris(return_X_y=True)
+    clf = xgb.XGBClassifier(n_estimators=2, early_stopping_rounds=1)
+    clf.fit(X, y, eval_set=[(X, y)])
+    best_iteration = clf.get_booster().best_iteration
+    assert best_iteration is not None
+    assert clf.n_estimators is not None
+    assert best_iteration == clf.n_estimators - 1
+
+    best_iteration = clf.best_iteration
+    assert best_iteration == clf.get_booster().best_iteration
+
+    clf.get_booster().set_attr(foo="bar")
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, "clf.json")
+        clf.save_model(path)
+
+        clf = xgb.XGBClassifier(n_estimators=2)
+        clf.load_model(path)
+        assert clf.n_estimators is not None
+        assert clf.get_booster().best_iteration == clf.n_estimators - 1
+        assert clf.best_iteration == clf.get_booster().best_iteration
+
+        assert clf.get_booster().attributes()["foo"] == "bar"
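Aside (not part of the diff): the foo="bar" round-trip above uses the Booster attribute API, which stores string key/value pairs inside the model. A minimal sketch with random, illustrative data:

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
booster = xgb.train(
    {}, xgb.DMatrix(rng.randn(16, 2), rng.randn(16)), num_boost_round=1
)

booster.set_attr(foo="bar")            # stored as a string attribute
assert booster.attr("foo") == "bar"    # read a single attribute back
assert booster.attributes()["foo"] == "bar"  # read them all as a dict
booster.set_attr(foo=None)             # assigning None removes it
assert booster.attr("foo") is None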