[breaking][skl] Remove parameter serialization. (#8963)

- Remove parameter serialization in the scikit-learn interface. The scikit-lear interface `save_model` will save only the model and discard all hyper-parameters. This is to align with the native XGBoost interface, which distinguishes the hyper-parameter and model parameters. With the scikit-learn interface, model parameters are attributes of the estimator. For instance, `n_features_in_`, `n_classes_` are always accessible with `estimator.n_features_in_` and `estimator.n_classes_`, but not with the `estimator.get_params`. - Define a `load_model` method for classifier to load its own attributes. - Set n_estimators to None by default.
2023-03-27 21:34:10 +08:00
parent 90645c4957
commit c2b3a13e70
9 changed files with 134 additions and 160 deletions
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -38,36 +38,34 @@ def test_binary_classification():
            assert err < 0.1


-@pytest.mark.parametrize('objective', ['multi:softmax', 'multi:softprob'])
+@pytest.mark.parametrize("objective", ["multi:softmax", "multi:softprob"])
 def test_multiclass_classification(objective):
    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        if output_margin:
-            err = sum(1 for i in range(len(preds))
-                      if preds[i].argmax() != labels[i]) / float(len(preds))
+            err = sum(
+                1 for i in range(len(preds)) if preds[i].argmax() != labels[i]
+            ) / float(len(preds))
        else:
-            err = sum(1 for i in range(len(preds))
-                      if preds[i] != labels[i]) / float(len(preds))
+            err = sum(1 for i in range(len(preds)) if preds[i] != labels[i]) / float(
+                len(preds)
+            )
        assert err < 0.4

-    iris = load_iris()
-    y = iris['target']
-    X = iris['data']
+    X, y = load_iris(return_X_y=True)
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
-        xgb_model = xgb.XGBClassifier(objective=objective).fit(X[train_index], y[train_index])
-        assert (xgb_model.get_booster().num_boosted_rounds() ==
-                xgb_model.n_estimators)
+        xgb_model = xgb.XGBClassifier(objective=objective).fit(
+            X[train_index], y[train_index]
+        )
+        assert xgb_model.get_booster().num_boosted_rounds() == 100
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
-        preds2 = xgb_model.predict(X[test_index], output_margin=True,
-                                   ntree_limit=3)
-        preds3 = xgb_model.predict(X[test_index], output_margin=True,
-                                   ntree_limit=0)
-        preds4 = xgb_model.predict(X[test_index], output_margin=False,
-                                   ntree_limit=3)
+        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
+        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
+        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
@@ -761,9 +759,9 @@ def test_parameters_access():
    clf = save_load(clf)

    assert clf.tree_method is None
-    assert clf.n_estimators == 2
+    assert clf.n_estimators is None
    assert clf.get_params()["tree_method"] is None
-    assert clf.get_params()["n_estimators"] == 2
+    assert clf.get_params()["n_estimators"] is None
    assert get_tm(clf) == "auto"  # discarded for save/load_model

    clf.set_params(tree_method="hist")
@@ -771,9 +769,7 @@ def test_parameters_access():
    clf = pickle.loads(pickle.dumps(clf))
    assert clf.get_params()["tree_method"] == "hist"
    clf = save_load(clf)
-    # FIXME(jiamingy): We should remove this behavior once we remove parameters
-    # serialization for skl save/load_model.
-    assert clf.get_params()["tree_method"] == "hist"
+    assert clf.get_params()["tree_method"] is None


 def test_kwargs_error():
@@ -902,6 +898,7 @@ def save_load_model(model_path):
        xgb_model.load_model(model_path)

        assert isinstance(xgb_model.classes_, np.ndarray)
+        np.testing.assert_equal(xgb_model.classes_, np.array([0, 1]))
        assert isinstance(xgb_model._Booster, xgb.Booster)

        preds = xgb_model.predict(X[test_index])
@@ -933,8 +930,10 @@ def test_save_load_model():
        save_load_model(model_path)

    from sklearn.datasets import load_digits
+    from sklearn.model_selection import train_test_split
+
    with tempfile.TemporaryDirectory() as tempdir:
-        model_path = os.path.join(tempdir, 'digits.model.json')
+        model_path = os.path.join(tempdir, 'digits.model.ubj')
        digits = load_digits(n_class=2)
        y = digits['target']
        X = digits['data']
@@ -959,6 +958,28 @@ def test_save_load_model():
        predt_1 = cls.predict(X)
        assert np.allclose(predt_0, predt_1)

+        # mclass
+        X, y = load_digits(n_class=10, return_X_y=True)
+        # small test_size to force early stop
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.01, random_state=1
+        )
+        clf = xgb.XGBClassifier(
+            n_estimators=64, tree_method="hist", early_stopping_rounds=2
+        )
+        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        score = clf.best_score
+        clf.save_model(model_path)
+
+        clf = xgb.XGBClassifier()
+        clf.load_model(model_path)
+        assert clf.classes_.size == 10
+        np.testing.assert_equal(clf.classes_, np.arange(10))
+        assert clf.n_classes_ == 10
+
+        assert clf.best_iteration == 27
+        assert clf.best_score == score
+

 def test_RFECV():
    from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris