Init estimation for regression. (#8272)

2023-01-11 02:04:56 +08:00
parent 1b58d81315
commit badeff1d74
29 changed files with 466 additions and 132 deletions
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -1078,7 +1078,7 @@ class XGBModel(XGBModelBase):
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
-    ) -> np.ndarray:
+    ) -> ArrayLike:
        """Predict with `X`.  If the model is trained with early stopping, then `best_iteration`
        is used automatically.  For tree models, when data is on GPU, like cupy array or
        cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
@@ -1528,7 +1528,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        validate_features: bool = True,
        base_margin: Optional[ArrayLike] = None,
        iteration_range: Optional[Tuple[int, int]] = None,
-    ) -> np.ndarray:
+    ) -> ArrayLike:
        with config_context(verbosity=self.verbosity):
            class_probs = super().predict(
                X=X,
--- a/python-package/xgboost/testing/dask.py
+++ b/python-package/xgboost/testing/dask.py
@@ -0,0 +1,54 @@
+"""Tests for dask shared by different test modules."""
+import numpy as np
+from dask import array as da
+from distributed import Client
+from xgboost.testing.updater import get_basescore
+
+import xgboost as xgb
+
+
+def check_init_estimation_clf(tree_method: str, client: Client) -> None:
+    """Test init estimation for classsifier."""
+    from sklearn.datasets import make_classification
+
+    X, y = make_classification(n_samples=4096 * 2, n_features=32, random_state=1994)
+    clf = xgb.XGBClassifier(n_estimators=1, max_depth=1, tree_method=tree_method)
+    clf.fit(X, y)
+    base_score = get_basescore(clf)
+
+    dx = da.from_array(X).rechunk(chunks=(32, None))
+    dy = da.from_array(y).rechunk(chunks=(32,))
+    dclf = xgb.dask.DaskXGBClassifier(
+        n_estimators=1, max_depth=1, tree_method=tree_method
+    )
+    dclf.client = client
+    dclf.fit(dx, dy)
+    dbase_score = get_basescore(dclf)
+    np.testing.assert_allclose(base_score, dbase_score)
+
+
+def check_init_estimation_reg(tree_method: str, client: Client) -> None:
+    """Test init estimation for regressor."""
+    from sklearn.datasets import make_regression
+
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(n_samples=4096 * 2, n_features=32, random_state=1994)
+    reg = xgb.XGBRegressor(n_estimators=1, max_depth=1, tree_method=tree_method)
+    reg.fit(X, y)
+    base_score = get_basescore(reg)
+
+    dx = da.from_array(X).rechunk(chunks=(32, None))
+    dy = da.from_array(y).rechunk(chunks=(32,))
+    dreg = xgb.dask.DaskXGBRegressor(
+        n_estimators=1, max_depth=1, tree_method=tree_method
+    )
+    dreg.client = client
+    dreg.fit(dx, dy)
+    dbase_score = get_basescore(dreg)
+    np.testing.assert_allclose(base_score, dbase_score)
+
+
+def check_init_estimation(tree_method: str, client: Client) -> None:
+    """Test init estimation."""
+    check_init_estimation_reg(tree_method, client)
+    check_init_estimation_clf(tree_method, client)
--- a/python-package/xgboost/testing/updater.py
+++ b/python-package/xgboost/testing/updater.py
@@ -0,0 +1,70 @@
+"""Tests for updaters."""
+import json
+
+import numpy as np
+
+import xgboost as xgb
+
+
+def get_basescore(model: xgb.XGBModel) -> float:
+    """Get base score from an XGBoost sklearn estimator."""
+    base_score = float(
+        json.loads(model.get_booster().save_config())["learner"]["learner_model_param"][
+            "base_score"
+        ]
+    )
+    return base_score
+
+
+def check_init_estimation(tree_method: str) -> None:
+    """Test for init estimation."""
+    from sklearn.datasets import (
+        make_classification,
+        make_multilabel_classification,
+        make_regression,
+    )
+
+    def run_reg(X: np.ndarray, y: np.ndarray) -> None:  # pylint: disable=invalid-name
+        reg = xgb.XGBRegressor(tree_method=tree_method, max_depth=1, n_estimators=1)
+        reg.fit(X, y, eval_set=[(X, y)])
+        base_score_0 = get_basescore(reg)
+        score_0 = reg.evals_result()["validation_0"]["rmse"][0]
+
+        reg = xgb.XGBRegressor(
+            tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
+        )
+        reg.fit(X, y, eval_set=[(X, y)])
+        base_score_1 = get_basescore(reg)
+        score_1 = reg.evals_result()["validation_0"]["rmse"][0]
+        assert not np.isclose(base_score_0, base_score_1)
+        assert score_0 < score_1  # should be better
+
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(n_samples=4096, random_state=17)
+    run_reg(X, y)
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(n_samples=4096, n_targets=3, random_state=17)
+    run_reg(X, y)
+
+    def run_clf(X: np.ndarray, y: np.ndarray) -> None:  # pylint: disable=invalid-name
+        clf = xgb.XGBClassifier(tree_method=tree_method, max_depth=1, n_estimators=1)
+        clf.fit(X, y, eval_set=[(X, y)])
+        base_score_0 = get_basescore(clf)
+        score_0 = clf.evals_result()["validation_0"]["logloss"][0]
+
+        clf = xgb.XGBClassifier(
+            tree_method=tree_method, max_depth=1, n_estimators=1, boost_from_average=0
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
+        base_score_1 = get_basescore(clf)
+        score_1 = clf.evals_result()["validation_0"]["logloss"][0]
+        assert not np.isclose(base_score_0, base_score_1)
+        assert score_0 < score_1  # should be better
+
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_classification(n_samples=4096, random_state=17)
+    run_clf(X, y)
+    X, y = make_multilabel_classification(
+        n_samples=4096, n_labels=3, n_classes=5, random_state=17
+    )
+    run_clf(X, y)