Support multi-class with base margin. (#7381)

This is already partially supported but was never properly tested. As a result, the only way to use it was to call `numpy.ndarray.flatten` on `base_margin` before passing it into XGBoost. This PR adds proper support
for most of the data types, along with tests.
This commit is contained in:
Jiaming Yuan
2021-11-02 13:38:00 +08:00
committed by GitHub
parent 6295dc3b67
commit a13321148a
18 changed files with 274 additions and 92 deletions

View File

@@ -7,7 +7,7 @@ import sys
import numpy as np
import scipy
import json
from typing import List, Tuple, Dict, Optional, Type, Any
from typing import List, Tuple, Dict, Optional, Type, Any, Callable
import asyncio
from functools import partial
from concurrent.futures import ThreadPoolExecutor
@@ -182,6 +182,50 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
assert prediction.shape[1] == 3
def run_boost_from_prediction_multi_clasas(
    X: xgb.dask._DaskCollection,
    y: xgb.dask._DaskCollection,
    tree_method: str,
    client: "Client"
) -> None:
    """Check multi-class training continued from a base margin.

    Trains a 4-round model, feeds its margin back in as ``base_margin`` for
    another 4 rounds, and verifies the result matches a single 8-round model
    trained from scratch.

    Parameters
    ----------
    X, y :
        Dask collections holding a multi-class dataset.
    tree_method :
        Tree method to test, e.g. "hist" or "approx".
    client :
        Dask client driving the computation.
    """
    model_0 = xgb.dask.DaskXGBClassifier(
        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
    )
    model_0.fit(X=X, y=y)
    margin = xgb.dask.inplace_predict(
        client, model_0.get_booster(), X, predict_type="margin"
    )

    # Continue boosting from the first model's margin for 4 more rounds.
    model_1 = xgb.dask.DaskXGBClassifier(
        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = xgb.dask.predict(
        client,
        model_1.get_booster(),
        xgb.dask.DaskDMatrix(client, X, base_margin=margin),
        output_margin=True
    )

    # Reference: 8 rounds trained in one go.
    model_2 = xgb.dask.DaskXGBClassifier(
        learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
    )
    model_2.fit(X=X, y=y)
    predictions_2 = xgb.dask.inplace_predict(
        client, model_2.get_booster(), X, predict_type="margin"
    )

    def to_numpy(arr):
        # Normalize device results to host numpy: cudf exposes ``.values``
        # (a cupy array) and cupy exposes ``.get()``.  The original code
        # applied the cudf conversion only to one side of the comparison.
        if hasattr(arr, "values"):
            arr = arr.values
        if hasattr(arr, "get"):
            arr = arr.get()
        return arr

    a = to_numpy(predictions_1.compute())
    b = to_numpy(predictions_2.compute())
    np.testing.assert_allclose(a, b, atol=1e-5)
def run_boost_from_prediction(
X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
) -> None:
@@ -227,11 +271,15 @@ def run_boost_from_prediction(
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
    """Boost from an initial margin for binary and multi-class data."""
    # Single deduplicated import (the file previously carried both the old
    # and the new import line).
    from sklearn.datasets import load_breast_cancer, load_digits

    # Binary classification.
    X_, y_ = load_breast_cancer(return_X_y=True)
    X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
    run_boost_from_prediction(X, y, tree_method, client)

    # Multi-class classification.
    X_, y_ = load_digits(return_X_y=True)
    X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
    run_boost_from_prediction_multi_clasas(X, y, tree_method, client)
def test_inplace_predict(client: "Client") -> None:
from sklearn.datasets import load_boston