Calculate base_score based on input labels for mae. (#8107)

Fit an intercept as base score for abs loss.
This commit is contained in:
Jiaming Yuan
2022-09-20 20:53:54 +08:00
committed by GitHub
parent 4f42aa5f12
commit fffb1fca52
42 changed files with 999 additions and 343 deletions

View File

@@ -102,34 +102,38 @@ def run_scikit_model_check(name, path):
@pytest.mark.skipif(**tm.no_sklearn())
def test_model_compatibility():
    """Test model compatibility, can only be run on CI as others don't
    have the credentials.

    The archive of reference models is downloaded and unpacked into the
    ``models`` directory next to this file, but only when that directory does
    not exist yet.  Every file found below it, except those named
    ``version``, is then checked according to its file-name prefix.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(path, "models")

    # Skip the download when the models directory is already present.
    if not os.path.exists(path):
        zip_path, _ = urllib.request.urlretrieve(
            "https://xgboost-ci-jenkins-artifacts.s3-us-west-2"
            + ".amazonaws.com/xgboost_model_compatibility_test.zip"
        )
        with zipfile.ZipFile(zip_path, "r") as z:
            z.extractall(path)

    models = [
        os.path.join(root, f)
        for root, subdir, files in os.walk(path)
        for f in files
        if f != "version"
    ]
    # An empty list means nothing was extracted / found; fail loudly.
    assert models

    for path in models:
        name = os.path.basename(path)
        if name.startswith("xgboost-"):
            booster = xgboost.Booster(model_file=path)
            run_booster_check(booster, name)
            # Do full serialization.
            booster = copy.copy(booster)
            run_booster_check(booster, name)
        elif name.startswith("xgboost_scikit"):
            run_scikit_model_check(name, path)
        else:
            # Any other file name is unexpected in the archive.
            assert False

View File

@@ -1,4 +1,4 @@
from random import choice
import json
from string import ascii_lowercase
from typing import Dict, Any
import testing as tm
@@ -397,3 +397,72 @@ class TestTreeMethod:
def test_categorical_missing(self, rows, cols, cats):
    """Run the categorical-missing check for ``approx`` and then ``hist``."""
    for tree_method in ("approx", "hist"):
        self.run_categorical_missing(rows, cols, cats, tree_method)
def run_adaptive(self, tree_method, weighted) -> None:
    """Check the ``base_score`` reported for ``reg:absoluteerror`` models.

    A regression dataset is generated and a reference score is computed from
    the labels: ``np.median(y)`` when unweighted, otherwise the 50th weighted
    percentile using shifted normal weights.  The method then asserts that:

    * training with that reference score passed explicitly and training
      without any ``base_score`` report the same score in the saved config;
    * the score survives a ``save_raw`` round trip in both the
      ``"deprecated"`` and ``"ubj"`` formats;
    * an explicitly supplied ``base_score`` (reference + 1) is kept as given.

    Parameters
    ----------
    tree_method :
        Value forwarded as the ``tree_method`` training parameter.
    weighted :
        Whether to attach sample weights to the training matrix.
    """
    from sklearn.datasets import make_regression
    from sklearn.utils import stats

    def get_score(config: Dict) -> float:
        return float(config["learner"]["learner_model_param"]["base_score"])

    def explicit_params(score) -> Dict:
        # Parameters for a run where the base score is supplied by the caller.
        return {
            "tree_method": tree_method,
            "base_score": score,
            "objective": "reg:absoluteerror",
        }

    rng = np.random.RandomState(1994)
    n_samples = 256
    X, y = make_regression(n_samples, 16, random_state=rng)

    if weighted:
        w = rng.normal(size=n_samples)
        # Shift the weights so that none of them is negative.
        w -= w.min()
        Xy = xgb.DMatrix(X, y, weight=w)
        base_score = stats._weighted_percentile(y, w, percentile=50)
    else:
        Xy = xgb.DMatrix(X, y)
        base_score = np.median(y)

    # Explicit reference score versus a score left for the library to choose.
    explicit = xgb.train(explicit_params(base_score), Xy, num_boost_round=1)
    fitted = xgb.train(
        {"tree_method": tree_method, "objective": "reg:absoluteerror"},
        Xy,
        num_boost_round=1,
    )
    explicit_config = json.loads(explicit.save_config())
    fitted_config = json.loads(fitted.save_config())
    assert get_score(explicit_config) == get_score(fitted_config)

    # The score must survive serialisation in each raw format.
    for raw_format in ("deprecated", "ubj"):
        raw_model = fitted.save_raw(raw_format=raw_format)
        reloaded = xgb.Booster(model_file=raw_model)
        reloaded_config = json.loads(reloaded.save_config())
        assert get_score(fitted_config) == get_score(reloaded_config)

    # A user-specified base score must be kept as given.
    shifted = xgb.train(explicit_params(base_score + 1.0), Xy, num_boost_round=1)
    shifted_config = json.loads(shifted.save_config())
    np.testing.assert_allclose(
        get_score(shifted_config), get_score(fitted_config) + 1
    )
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize(
    "tree_method,weighted",
    [
        ("approx", False),
        ("hist", False),
        ("approx", True),
        ("hist", True),
    ],
)
def test_adaptive(self, tree_method, weighted) -> None:
    """Run ``run_adaptive`` for each tree method, with and without weights."""
    self.run_adaptive(tree_method, weighted)

View File

@@ -1537,13 +1537,56 @@ class TestWithDask:
@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.gtest
def test_quantile_same_on_all_workers(self) -> None:
    """Invoke ``run_quantile`` once for the ``SameOnAllWorkers`` case."""
    self.run_quantile("SameOnAllWorkers")
def test_adaptive(self) -> None:
    """Check the ``base_score`` reported when training across two workers.

    Each worker builds its own small dataset inside a ``RabitContext``,
    trains one round with ``reg:absoluteerror`` and asserts that the base
    score stored in the saved config equals ``250.0``.
    """

    def get_score(config: Dict) -> float:
        return float(config["learner"]["learner_model_param"]["base_score"])

    def local_test(rabit_args: List[bytes], worker_id: int) -> bool:
        with xgb.dask.RabitContext(rabit_args):
            # Worker 0 holds three rows labelled 0.0; any other worker holds
            # a single row labelled 1000.0.  The feature value is always 0.0.
            n_rows, label = (3, 0.0) if worker_id == 0 else (1, 1000.0)
            y = np.array([label] * n_rows)
            x = np.array([[0.0]] * n_rows)

            booster = xgb.train(
                {"tree_method": "hist", "objective": "reg:absoluteerror"},
                xgb.DMatrix(x, y),
                num_boost_round=1,
            )
            base_score = get_score(json.loads(booster.save_config()))
            assert base_score == 250.0
            return True

    with LocalCluster(n_workers=2, dashboard_address=":0") as cluster, Client(
        cluster
    ) as client:
        workers = _get_client_workers(client)
        rabit_args = client.sync(
            xgb.dask._get_rabit_args, len(workers), None, client
        )
        # One task per worker, identified by its position in ``workers``.
        futures = [
            client.submit(local_test, rabit_args, worker_id)
            for worker_id in range(len(workers))
        ]
        results = client.gather(futures)
        assert all(results)
def test_n_workers(self) -> None:
with LocalCluster(n_workers=2, dashboard_address=":0") as cluster:
with Client(cluster) as client:
workers = _get_client_workers(client)
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
dX = client.submit(da.from_array, X, workers=[workers[0]]).result()
dy = client.submit(da.from_array, y, workers=[workers[0]]).result()