Calculate base_score based on input labels for mae. (#8107)

Fit an intercept as base score for abs loss.
2022-09-20 20:53:54 +08:00
parent 4f42aa5f12
commit fffb1fca52
42 changed files with 999 additions and 343 deletions
--- a/tests/python/test_model_compatibility.py
+++ b/tests/python/test_model_compatibility.py
@@ -102,34 +102,38 @@ def run_scikit_model_check(name, path):

@pytest.mark.skipif(**tm.no_sklearn())
 def test_model_compatibility():
-    '''Test model compatibility, can only be run on CI as others don't
+    """Test model compatibility, can only be run on CI as others don't
    have the credentials.

-    '''
+    """
    path = os.path.dirname(os.path.abspath(__file__))
-    path = os.path.join(path, 'models')
+    path = os.path.join(path, "models")

-    zip_path, _ = urllib.request.urlretrieve('https://xgboost-ci-jenkins-artifacts.s3-us-west-2' +
-                                             '.amazonaws.com/xgboost_model_compatibility_test.zip')
-    with zipfile.ZipFile(zip_path, 'r') as z:
-        z.extractall(path)
+    if not os.path.exists(path):  # download and unpack only when the models directory is absent
+        zip_path, _ = urllib.request.urlretrieve(
+            "https://xgboost-ci-jenkins-artifacts.s3-us-west-2"
+            + ".amazonaws.com/xgboost_model_compatibility_test.zip"
+        )
+        with zipfile.ZipFile(zip_path, "r") as z:
+            z.extractall(path)  # archive contents are extracted into the models directory

    models = [
-        os.path.join(root, f) for root, subdir, files in os.walk(path)
+        os.path.join(root, f)
+        for root, subdir, files in os.walk(path)
        for f in files
-        if f != 'version'
+        if f != "version"
    ]
    assert models

    for path in models:
        name = os.path.basename(path)
-        if name.startswith('xgboost-'):
+        if name.startswith("xgboost-"):
            booster = xgboost.Booster(model_file=path)
            run_booster_check(booster, name)
            # Do full serialization.
            booster = copy.copy(booster)
            run_booster_check(booster, name)
-        elif name.startswith('xgboost_scikit'):
+        elif name.startswith("xgboost_scikit"):
            run_scikit_model_check(name, path)
        else:
            assert False
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -1,4 +1,4 @@
-from random import choice
+import json
 from string import ascii_lowercase
 from typing import Dict, Any
 import testing as tm
@@ -397,3 +397,72 @@ class TestTreeMethod:
    def test_categorical_missing(self, rows, cols, cats):
        self.run_categorical_missing(rows, cols, cats, "approx")
        self.run_categorical_missing(rows, cols, cats, "hist")
+
+    def run_adaptive(self, tree_method, weighted) -> None:  # base_score check for reg:absoluteerror
+        rng = np.random.RandomState(1994)
+        from sklearn.datasets import make_regression
+        from sklearn.utils import stats
+        # Expected intercept computed below: median of y, or its weighted 50th percentile.
+        n_samples = 256
+        X, y = make_regression(n_samples, 16, random_state=rng)
+        if weighted:
+            w = rng.normal(size=n_samples)
+            w -= w.min()  # shift so the smallest weight is 0 and none is negative
+            Xy = xgb.DMatrix(X, y, weight=w)
+            base_score = stats._weighted_percentile(y, w, percentile=50)  # underscore-prefixed sklearn helper
+        else:
+            Xy = xgb.DMatrix(X, y)
+            base_score = np.median(y)
+        # booster_0 receives the expected base_score explicitly; booster_1 receives none.
+        booster_0 = xgb.train(
+            {
+                "tree_method": tree_method,
+                "base_score": base_score,
+                "objective": "reg:absoluteerror",
+            },
+            Xy,
+            num_boost_round=1,
+        )
+        booster_1 = xgb.train(
+            {"tree_method": tree_method, "objective": "reg:absoluteerror"},
+            Xy,
+            num_boost_round=1,
+        )
+        config_0 = json.loads(booster_0.save_config())
+        config_1 = json.loads(booster_1.save_config())
+
+        def get_score(config: Dict) -> float:  # base_score is read from the saved config and cast to float
+            return float(config["learner"]["learner_model_param"]["base_score"])
+
+        assert get_score(config_0) == get_score(config_1)  # explicit and unspecified runs must agree exactly
+        # The value must be unchanged after a save/load round trip in the "deprecated" raw format ...
+        raw_booster = booster_1.save_raw(raw_format="deprecated")
+        booster_2 = xgb.Booster(model_file=raw_booster)
+        config_2 = json.loads(booster_2.save_config())
+        assert get_score(config_1) == get_score(config_2)
+        # ... and in the "ubj" raw format.
+        raw_booster = booster_1.save_raw(raw_format="ubj")
+        booster_2 = xgb.Booster(model_file=raw_booster)
+        config_2 = json.loads(booster_2.save_config())
+        assert get_score(config_1) == get_score(config_2)
+        # A user-supplied base_score (shifted by 1.0 here) must show up in the config as given.
+        booster_0 = xgb.train(
+            {
+                "tree_method": tree_method,
+                "base_score": base_score + 1.0,
+                "objective": "reg:absoluteerror",
+            },
+            Xy,
+            num_boost_round=1,
+        )
+        config_0 = json.loads(booster_0.save_config())
+        np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)
+
+    @pytest.mark.skipif(**tm.no_sklearn())  # run_adaptive imports from sklearn
+    @pytest.mark.parametrize(
+        "tree_method,weighted", [  # both tree methods, each without and with sample weights
+            ("approx", False), ("hist", False), ("approx", True), ("hist", True)
+        ]
+    )
+    def test_adaptive(self, tree_method, weighted) -> None:
+        self.run_adaptive(tree_method, weighted)  # all assertions live in run_adaptive
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -1537,13 +1537,56 @@ class TestWithDask:
    @pytest.mark.skipif(**tm.no_dask())
    @pytest.mark.gtest
    def test_quantile_same_on_all_workers(self) -> None:
-        self.run_quantile('SameOnAllWorkers')
+        self.run_quantile("SameOnAllWorkers")  # NOTE(review): presumably the gtest case name; confirm in run_quantile
+
+    def test_adaptive(self) -> None:  # base_score for reg:absoluteerror with labels split across two workers
+        def get_score(config: Dict) -> float:
+            return float(config["learner"]["learner_model_param"]["base_score"])
+
+        def local_test(rabit_args: List[bytes], worker_id: int) -> bool:  # trains on one worker's share of the data
+            with xgb.dask.RabitContext(rabit_args):
+                if worker_id == 0:
+                    y = np.array([0.0, 0.0, 0.0])  # three rows, all labelled 0
+                    x = np.array([[0.0]] * 3)
+                else:
+                    y = np.array([1000.0])  # a single row labelled 1000
+                    x = np.array(
+                        [
+                            [0.0],
+                        ]
+                    )
+
+                Xy = xgb.DMatrix(x, y)
+                booster = xgb.train(
+                    {"tree_method": "hist", "objective": "reg:absoluteerror"},
+                    Xy,
+                    num_boost_round=1,
+                )
+                config = json.loads(booster.save_config())
+                base_score = get_score(config)
+                assert base_score == 250.0  # 250 == (3 * 0 + 1 * 1000) / 4; NOTE(review): presumably row-weighted mean of per-worker medians, confirm
+                return True
+        # Two-worker local cluster; local_test is submitted once per worker entry with its index as worker_id.
+        with LocalCluster(n_workers=2, dashboard_address=":0") as cluster:
+            with Client(cluster) as client:
+                workers = _get_client_workers(client)
+                rabit_args = client.sync(
+                    xgb.dask._get_rabit_args, len(workers), None, client
+                )
+                futures = []
+                for i, _ in enumerate(workers):
+                    f = client.submit(local_test, rabit_args, i)
+                    futures.append(f)
+
+                results = client.gather(futures)
+                assert all(results)  # each local_test returns True once its own assertion has passed

    def test_n_workers(self) -> None:
        with LocalCluster(n_workers=2, dashboard_address=":0") as cluster:
            with Client(cluster) as client:
                workers = _get_client_workers(client)
                from sklearn.datasets import load_breast_cancer
+
                X, y = load_breast_cancer(return_X_y=True)
                dX = client.submit(da.from_array, X, workers=[workers[0]]).result()
                dy = client.submit(da.from_array, y, workers=[workers[0]]).result()