Add base_margin for evaluation dataset. (#6591)
* Add base margin to evaluation datasets. * Unify the code base for evaluation matrices.
This commit is contained in:
@@ -17,7 +17,7 @@ import subprocess
|
||||
import hypothesis
|
||||
from hypothesis import given, settings, note, HealthCheck
|
||||
from test_updaters import hist_parameter_strategy, exact_parameter_strategy
|
||||
from test_with_sklearn import run_feature_weights
|
||||
from test_with_sklearn import run_feature_weights, run_data_initialization
|
||||
|
||||
if sys.platform.startswith("win"):
|
||||
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
|
||||
@@ -176,6 +176,22 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
|
||||
|
||||
assert np.all(predictions_1.compute() == predictions_2.compute())
|
||||
|
||||
margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
|
||||
margined.fit(
|
||||
X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
|
||||
)
|
||||
|
||||
unmargined = xgb.dask.DaskXGBClassifier(n_estimators=4)
|
||||
unmargined.fit(X=X, y=y, eval_set=[(X, y)], base_margin=margin)
|
||||
|
||||
margined_res = margined.evals_result()['validation_0']['logloss']
|
||||
unmargined_res = unmargined.evals_result()['validation_0']['logloss']
|
||||
|
||||
assert len(margined_res) == len(unmargined_res)
|
||||
for i in range(len(margined_res)):
|
||||
# `margined` also supplies the base margin for its evaluation set, so its error is smaller.
|
||||
assert margined_res[i] < unmargined_res[i]
|
||||
|
||||
|
||||
def test_dask_missing_value_reg(client: "Client") -> None:
|
||||
X_0 = np.ones((20 // 2, kCols))
|
||||
@@ -955,7 +971,7 @@ class TestWithDask:
|
||||
results_native['validation_0']['rmse'])
|
||||
tm.non_increasing(results_native['validation_0']['rmse'])
|
||||
|
||||
def test_data_initialization(self) -> None:
|
||||
def test_no_duplicated_partition(self) -> None:
|
||||
'''Assert each worker has the correct amount of data, and DMatrix initialization doesn't
|
||||
generate unnecessary copies of data.
|
||||
|
||||
@@ -995,6 +1011,13 @@ class TestWithDask:
|
||||
# Subtract the on disk resource from each worker
|
||||
assert cnt - n_workers == n_partitions
|
||||
|
||||
def test_data_initialization(self, client: "Client") -> None:
    """Check that fitting a Dask classifier does not build duplicated DMatrix objects."""
    from sklearn.datasets import load_digits

    features, labels = load_digits(return_X_y=True)
    # Wrap the in-memory arrays as dask collections with 32-row chunks.
    dask_features = dd.from_array(features, chunksize=32)
    dask_labels = dd.from_array(labels, chunksize=32)
    run_data_initialization(
        xgb.dask.DaskDMatrix,
        xgb.dask.DaskXGBClassifier,
        dask_features,
        dask_labels,
    )
|
||||
|
||||
def run_shap(self, X: Any, y: Any, params: Dict[str, Any], client: "Client") -> None:
|
||||
X, y = da.from_array(X, chunks=(32, -1)), da.from_array(y, chunks=32)
|
||||
Xy = xgb.dask.DaskDMatrix(client, X, y)
|
||||
|
||||
@@ -717,13 +717,13 @@ def test_validation_weights_xgbmodel():
|
||||
assert all((logloss_with_weights[i] != logloss_without_weights[i]
|
||||
for i in [0, 1]))
|
||||
|
||||
with pytest.raises(AssertionError):
|
||||
with pytest.raises(ValueError):
|
||||
# lengths of eval set and sample weight don't match.
|
||||
clf.fit(X_train, y_train, sample_weight=weights_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
sample_weight_eval_set=[weights_train])
|
||||
|
||||
with pytest.raises(AssertionError):
|
||||
with pytest.raises(ValueError):
|
||||
cls = xgb.XGBClassifier()
|
||||
cls.fit(X_train, y_train, sample_weight=weights_train,
|
||||
eval_set=[(X_train, y_train), (X_test, y_test)],
|
||||
@@ -1118,19 +1118,9 @@ def run_boost_from_prediction(tree_method):
|
||||
assert np.all(predictions_1 == predictions_2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_boost_from_prediction_hist():
|
||||
run_boost_from_prediction('hist')
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_boost_from_prediction_approx():
|
||||
run_boost_from_prediction('approx')
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_boost_from_prediction_exact():
|
||||
run_boost_from_prediction('exact')
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def test_boost_from_prediction(tree_method):
    """Run the shared boost-from-prediction check once per tree method."""
    run_boost_from_prediction(tree_method=tree_method)
|
||||
|
||||
|
||||
def test_estimator_type():
|
||||
@@ -1154,3 +1144,32 @@ def test_estimator_type():
|
||||
|
||||
cls = xgb.XGBClassifier()
|
||||
cls.load_model(path) # no error
|
||||
|
||||
|
||||
def run_data_initialization(DMatrix, model, X, y):
    """Assert that we don't create duplicated DMatrix.

    ``DMatrix.__init__`` is temporarily replaced by a wrapper that counts how
    many times it is invoked while ``model(n_estimators=1).fit`` runs.

    Parameters
    ----------
    DMatrix :
        Class whose constructor calls are counted.
    model :
        Estimator class; it is instantiated as ``model(n_estimators=1)`` and
        fitted with ``eval_set`` built from ``X`` and ``y``.
    X, y :
        Training data and labels.  They are reused as the evaluation set, and
        ``y.copy()`` is used to build a distinct-but-equal evaluation label.

    Raises
    ------
    AssertionError
        If the number of constructed ``DMatrix`` objects is not the expected
        one (1 when the evaluation set is the training data itself, 2 when
        the evaluation label is a copy).
    """
    old_init = DMatrix.__init__
    count = 0

    def new_init(self, *args, **kwargs):
        # Count every construction, then defer to the real initializer.
        nonlocal count
        count += 1
        return old_init(self, *args, **kwargs)

    DMatrix.__init__ = new_init
    try:
        model(n_estimators=1).fit(X, y, eval_set=[(X, y)])
        # The evaluation set is the very same objects as the training data,
        # so only 1 DMatrix is created.
        assert count == 1

        count = 0
        y_copy = y.copy()
        model(n_estimators=1).fit(X, y, eval_set=[(X, y_copy)])
        # A different Python object is considered different.
        assert count == 2
    finally:
        # Always undo the monkeypatch, even when an assertion or ``fit``
        # fails, so the patched constructor cannot leak into other tests.
        DMatrix.__init__ = old_init
|
||||
|
||||
|
||||
def test_data_initialization():
    """Check that fitting XGBClassifier does not build duplicated DMatrix objects."""
    from sklearn.datasets import load_digits

    features, labels = load_digits(return_X_y=True)
    run_data_initialization(
        xgb.DMatrix,
        xgb.XGBClassifier,
        features,
        labels,
    )
|
||||
|
||||
Reference in New Issue
Block a user