Init estimation for regression. (#8272)

This commit is contained in:
Jiaming Yuan
2023-01-11 02:04:56 +08:00
committed by GitHub
parent 1b58d81315
commit badeff1d74
29 changed files with 466 additions and 132 deletions

View File

@@ -1078,7 +1078,7 @@ class XGBModel(XGBModelBase):
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
) -> ArrayLike:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration`
is used automatically. For tree models, when data is on GPU, like cupy array or
cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
@@ -1528,7 +1528,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
validate_features: bool = True,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
) -> ArrayLike:
with config_context(verbosity=self.verbosity):
class_probs = super().predict(
X=X,

View File

@@ -0,0 +1,54 @@
"""Tests for dask shared by different test modules."""
import numpy as np
from dask import array as da
from distributed import Client
from xgboost.testing.updater import get_basescore
import xgboost as xgb
def check_init_estimation_clf(tree_method: str, client: Client) -> None:
    """Check that dask and single-node classifiers agree on the base score.

    A one-round, depth-one ``XGBClassifier`` is fitted on a synthetic
    classification dataset.  A ``DaskXGBClassifier`` with the same parameters
    is then fitted on the same data split into 32-row chunks, and the
    ``base_score`` read back from both models must match.

    Parameters
    ----------
    tree_method :
        Forwarded to both estimators as ``tree_method``.
    client :
        Dask client assigned to the distributed estimator before fitting.

    """
    from sklearn.datasets import make_classification

    # Both estimators are built from the same keyword arguments.
    params = {"n_estimators": 1, "max_depth": 1, "tree_method": tree_method}

    X, y = make_classification(n_samples=4096 * 2, n_features=32, random_state=1994)

    # Single-node reference.
    local_clf = xgb.XGBClassifier(**params)
    local_clf.fit(X, y)
    expected = get_basescore(local_clf)

    # Distributed model on the same data, chunked along the row axis only.
    dask_X = da.from_array(X).rechunk(chunks=(32, None))
    dask_y = da.from_array(y).rechunk(chunks=(32,))
    dask_clf = xgb.dask.DaskXGBClassifier(**params)
    dask_clf.client = client
    dask_clf.fit(dask_X, dask_y)
    actual = get_basescore(dask_clf)

    np.testing.assert_allclose(expected, actual)
def check_init_estimation_reg(tree_method: str, client: Client) -> None:
    """Check that dask and single-node regressors agree on the base score.

    A one-round, depth-one ``XGBRegressor`` is fitted on a synthetic
    regression dataset.  A ``DaskXGBRegressor`` with the same parameters is
    then fitted on the same data split into 32-row chunks, and the
    ``base_score`` read back from both models must match.

    Parameters
    ----------
    tree_method :
        Forwarded to both estimators as ``tree_method``.
    client :
        Dask client assigned to the distributed estimator before fitting.

    """
    from sklearn.datasets import make_regression

    # Both estimators are built from the same keyword arguments.
    params = {"n_estimators": 1, "max_depth": 1, "tree_method": tree_method}

    # pylint: disable=unbalanced-tuple-unpacking
    X, y = make_regression(n_samples=4096 * 2, n_features=32, random_state=1994)

    # Single-node reference.
    local_reg = xgb.XGBRegressor(**params)
    local_reg.fit(X, y)
    expected = get_basescore(local_reg)

    # Distributed model on the same data, chunked along the row axis only.
    dask_X = da.from_array(X).rechunk(chunks=(32, None))
    dask_y = da.from_array(y).rechunk(chunks=(32,))
    dask_reg = xgb.dask.DaskXGBRegressor(**params)
    dask_reg.client = client
    dask_reg.fit(dask_X, dask_y)
    actual = get_basescore(dask_reg)

    np.testing.assert_allclose(expected, actual)
def check_init_estimation(tree_method: str, client: Client) -> None:
    """Run the dask init-estimation checks: regressor first, then classifier.

    Parameters
    ----------
    tree_method :
        Forwarded unchanged to each individual check.
    client :
        Dask client forwarded unchanged to each individual check.

    """
    for check in (check_init_estimation_reg, check_init_estimation_clf):
        check(tree_method, client)

View File

@@ -0,0 +1,70 @@
"""Tests for updaters."""
import json
import numpy as np
import xgboost as xgb
def get_basescore(model: xgb.XGBModel) -> float:
    """Return the ``base_score`` stored in an XGBoost sklearn estimator.

    The value is read from the JSON configuration saved by the estimator's
    booster, under ``learner -> learner_model_param -> base_score``, and
    converted to ``float``.
    """
    config = json.loads(model.get_booster().save_config())
    learner_model_param = config["learner"]["learner_model_param"]
    return float(learner_model_param["base_score"])
def check_init_estimation(tree_method: str) -> None:
    """Check that the default base score beats ``boost_from_average=0``.

    For each synthetic dataset (single- and multi-target regression, plain and
    multi-label classification) two one-round, depth-one models are fitted and
    evaluated on their own training data: one with default parameters and one
    with ``boost_from_average=0``.  The two base scores must differ, and the
    default model must have the lower first-round metric (``rmse`` for
    regressors, ``logloss`` for classifiers).

    Parameters
    ----------
    tree_method :
        Forwarded to every estimator as ``tree_method``.

    """
    from sklearn.datasets import (
        make_classification,
        make_multilabel_classification,
        make_regression,
    )

    def fit_one_round(  # pylint: disable=invalid-name
        estimator: type, metric: str, X: np.ndarray, y: np.ndarray, **extra: int
    ) -> tuple:
        """Fit a one-round stump; return (base score, first-round metric)."""
        model = estimator(
            tree_method=tree_method, max_depth=1, n_estimators=1, **extra
        )
        # The training data doubles as the only evaluation set.
        model.fit(X, y, eval_set=[(X, y)])
        base_score = get_basescore(model)
        first_round = model.evals_result()["validation_0"][metric][0]
        return base_score, first_round

    def compare(  # pylint: disable=invalid-name
        estimator: type, metric: str, X: np.ndarray, y: np.ndarray
    ) -> None:
        """Assert the default model differs from, and beats, the override."""
        default_base, default_metric = fit_one_round(estimator, metric, X, y)
        override_base, override_metric = fit_one_round(
            estimator, metric, X, y, boost_from_average=0
        )
        assert not np.isclose(default_base, override_base)
        assert default_metric < override_metric  # should be better

    # pylint: disable=unbalanced-tuple-unpacking
    X, y = make_regression(n_samples=4096, random_state=17)
    compare(xgb.XGBRegressor, "rmse", X, y)

    # pylint: disable=unbalanced-tuple-unpacking
    X, y = make_regression(n_samples=4096, n_targets=3, random_state=17)
    compare(xgb.XGBRegressor, "rmse", X, y)

    # pylint: disable=unbalanced-tuple-unpacking
    X, y = make_classification(n_samples=4096, random_state=17)
    compare(xgb.XGBClassifier, "logloss", X, y)

    X, y = make_multilabel_classification(
        n_samples=4096, n_labels=3, n_classes=5, random_state=17
    )
    compare(xgb.XGBClassifier, "logloss", X, y)