Init estimation for regression. (#8272)

2023-01-11 02:04:56 +08:00
parent 1b58d81315
commit badeff1d74
29 changed files with 466 additions and 132 deletions
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from xgboost.testing.updater import get_basescore

 import xgboost as xgb
 from xgboost import testing as tm
@@ -11,16 +12,12 @@ class TestEarlyStopping:
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_early_stopping_nonparallel(self):
        from sklearn.datasets import load_digits
-        try:
-            from sklearn.model_selection import train_test_split
-        except ImportError:
-            from sklearn.cross_validation import train_test_split
+        from sklearn.model_selection import train_test_split

        digits = load_digits(n_class=2)
        X = digits['data']
        y = digits['target']
-        X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                            random_state=0)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
        clf1 = xgb.XGBClassifier(learning_rate=0.1)
        clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
                 eval_set=[(X_test, y_test)])
@@ -31,9 +28,23 @@ class TestEarlyStopping:
        assert clf1.best_score == clf2.best_score
        assert clf1.best_score != 1
        # check overfit
-        clf3 = xgb.XGBClassifier(learning_rate=0.1)
-        clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
-                 eval_set=[(X_test, y_test)])
+        clf3 = xgb.XGBClassifier(
+            learning_rate=0.1,
+            eval_metric="auc",
+            early_stopping_rounds=10
+        )
+        clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        base_score = get_basescore(clf3)
+        assert 0.53 > base_score > 0.5
+
+        clf3 = xgb.XGBClassifier(
+            learning_rate=0.1,
+            base_score=.5,
+            eval_metric="auc",
+            early_stopping_rounds=10
+        )
+        clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+
        assert clf3.best_score == 1

    def evalerror(self, preds, dtrain):
--- a/tests/python/test_tree_regularization.py
+++ b/tests/python/test_tree_regularization.py
@@ -9,11 +9,13 @@ train_data = xgb.DMatrix(np.array([[1]]), label=np.array([1]))
 class TestTreeRegularization:
    def test_alpha(self):
        params = {
-            'tree_method': 'exact', 'verbosity': 0,
-            'objective': 'reg:squarederror',
-            'eta': 1,
-            'lambda': 0,
-            'alpha': 0.1
+            "tree_method": "exact",
+            "verbosity": 0,
+            "objective": "reg:squarederror",
+            "eta": 1,
+            "lambda": 0,
+            "alpha": 0.1,
+            "base_score": 0.5,
        }

        model = xgb.train(params, train_data, 1)
@@ -27,11 +29,13 @@ class TestTreeRegularization:

    def test_lambda(self):
        params = {
-            'tree_method': 'exact', 'verbosity': 0,
-            'objective': 'reg:squarederror',
-            'eta': 1,
-            'lambda': 1,
-            'alpha': 0
+            "tree_method": "exact",
+            "verbosity": 0,
+            "objective": "reg:squarederror",
+            "eta": 1,
+            "lambda": 1,
+            "alpha": 0,
+            "base_score": 0.5,
        }

        model = xgb.train(params, train_data, 1)
@@ -45,11 +49,13 @@ class TestTreeRegularization:

    def test_alpha_and_lambda(self):
        params = {
-            'tree_method': 'exact', 'verbosity': 1,
-            'objective': 'reg:squarederror',
-            'eta': 1,
-            'lambda': 1,
-            'alpha': 0.1
+            "tree_method": "exact",
+            "verbosity": 1,
+            "objective": "reg:squarederror",
+            "eta": 1,
+            "lambda": 1,
+            "alpha": 0.1,
+            "base_score": 0.5,
        }

        model = xgb.train(params, train_data, 1)
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -10,6 +10,7 @@ from xgboost.testing.params import (
    exact_parameter_strategy,
    hist_parameter_strategy,
 )
+from xgboost.testing.updater import check_init_estimation

 import xgboost as xgb
 from xgboost import testing as tm
@@ -449,3 +450,6 @@ class TestTreeMethod:
    )
    def test_adaptive(self, tree_method, weighted) -> None:
        self.run_adaptive(tree_method, weighted)
+
+    def test_init_estimation(self) -> None:
+        check_init_estimation("hist")
--- a/tests/python/test_with_shap.py
+++ b/tests/python/test_with_shap.py
@@ -9,6 +9,7 @@ except Exception:
    shap = None
    pass

+
 pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")


@@ -16,11 +17,16 @@ pytestmark = pytest.mark.skipif(shap is None, reason="Requires shap package")
 # Changes in binary format may cause problems
 def test_with_shap():
    from sklearn.datasets import fetch_california_housing
+
    X, y = fetch_california_housing(return_X_y=True)
    dtrain = xgb.DMatrix(X, label=y)
    model = xgb.train({"learning_rate": 0.01}, dtrain, 10)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    margin = model.predict(dtrain, output_margin=True)
-    assert np.allclose(np.sum(shap_values, axis=len(shap_values.shape) - 1),
-                       margin - explainer.expected_value, 1e-3, 1e-3)
+    assert np.allclose(
+        np.sum(shap_values, axis=len(shap_values.shape) - 1),
+        margin - explainer.expected_value,
+        1e-3,
+        1e-3,
+    )
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -9,6 +9,7 @@ import numpy as np
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
 from xgboost.testing.shared import get_feature_weights, validate_data_initialization
+from xgboost.testing.updater import get_basescore

 import xgboost as xgb
 from xgboost import testing as tm
@@ -196,19 +197,22 @@ def test_stacking_classification():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf.fit(X_train, y_train).score(X_test, y_test)

-
@pytest.mark.skipif(**tm.no_pandas())
 def test_feature_importances_weight():
    from sklearn.datasets import load_digits

    digits = load_digits(n_class=2)
-    y = digits['target']
-    X = digits['data']
+    y = digits["target"]
+    X = digits["data"]
+
+    xgb_model = xgb.XGBClassifier(
+        random_state=0,
+        tree_method="exact",
+        learning_rate=0.1,
+        importance_type="weight",
+        base_score=0.5,
+    ).fit(X, y)

-    xgb_model = xgb.XGBClassifier(random_state=0,
-                                  tree_method="exact",
-                                  learning_rate=0.1,
-                                  importance_type="weight").fit(X, y)
    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                    0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
                    0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0.,
@@ -223,16 +227,22 @@ def test_feature_importances_weight():
    import pandas as pd
    y = pd.Series(digits['target'])
    X = pd.DataFrame(digits['data'])
-    xgb_model = xgb.XGBClassifier(random_state=0,
-                                  tree_method="exact",
-                                  learning_rate=0.1,
-                                  importance_type="weight").fit(X, y)
+    xgb_model = xgb.XGBClassifier(
+        random_state=0,
+        tree_method="exact",
+        learning_rate=0.1,
+        base_score=.5,
+        importance_type="weight"
+    ).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

-    xgb_model = xgb.XGBClassifier(random_state=0,
-                                  tree_method="exact",
-                                  learning_rate=0.1,
-                                  importance_type="weight").fit(X, y)
+    xgb_model = xgb.XGBClassifier(
+        random_state=0,
+        tree_method="exact",
+        learning_rate=0.1,
+        importance_type="weight",
+        base_score=.5,
+    ).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

    with pytest.raises(ValueError):
@@ -274,6 +284,7 @@ def test_feature_importances_gain():
        random_state=0, tree_method="exact",
        learning_rate=0.1,
        importance_type="gain",
+        base_score=0.5,
    ).fit(X, y)

    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
@@ -296,6 +307,7 @@ def test_feature_importances_gain():
        tree_method="exact",
        learning_rate=0.1,
        importance_type="gain",
+        base_score=0.5,
    ).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

@@ -304,6 +316,7 @@ def test_feature_importances_gain():
        tree_method="exact",
        learning_rate=0.1,
        importance_type="gain",
+        base_score=0.5,
    ).fit(X, y)
    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

@@ -593,18 +606,21 @@ def test_split_value_histograms():

    digits_2class = load_digits(n_class=2)

-    X = digits_2class['data']
-    y = digits_2class['target']
+    X = digits_2class["data"]
+    y = digits_2class["target"]

    dm = xgb.DMatrix(X, label=y)
-    params = {'max_depth': 6, 'eta': 0.01, 'verbosity': 0,
-              'objective': 'binary:logistic'}
+    params = {
+        "max_depth": 6,
+        "eta": 0.01,
+        "verbosity": 0,
+        "objective": "binary:logistic",
+        "base_score": 0.5,
+    }

    gbdt = xgb.train(params, dm, num_boost_round=10)
-    assert gbdt.get_split_value_histogram("not_there",
-                                          as_pandas=True).shape[0] == 0
-    assert gbdt.get_split_value_histogram("not_there",
-                                          as_pandas=False).shape[0] == 0
+    assert gbdt.get_split_value_histogram("not_there", as_pandas=True).shape[0] == 0
+    assert gbdt.get_split_value_histogram("not_there", as_pandas=False).shape[0] == 0
    assert gbdt.get_split_value_histogram("f28", bins=0).shape[0] == 1
    assert gbdt.get_split_value_histogram("f28", bins=1).shape[0] == 1
    assert gbdt.get_split_value_histogram("f28", bins=2).shape[0] == 2
@@ -748,11 +764,7 @@ def test_sklearn_get_default_params():
    cls = xgb.XGBClassifier()
    assert cls.get_params()["base_score"] is None
    cls.fit(X[:4, ...], y[:4, ...])
-    base_score = float(
-        json.loads(cls.get_booster().save_config())["learner"]["learner_model_param"][
-            "base_score"
-        ]
-    )
+    base_score = get_basescore(cls)
    np.testing.assert_equal(base_score, 0.5)