Support multi-class with base margin. (#7381)
This was already partially supported but never properly tested, so the only way to use it was to call `numpy.ndarray.flatten` on `base_margin` before passing it into XGBoost. This PR adds proper support for most of the data types, along with tests.
This commit is contained in:
@@ -252,6 +252,8 @@ TEST(MetaInfo, Validate) {
|
||||
EXPECT_THROW(info.Validate(1), dmlc::Error);
|
||||
|
||||
xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
|
||||
d_groups.SetDevice(0);
|
||||
d_groups.DevicePointer(); // pull to device
|
||||
auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1);
|
||||
std::string arr_interface_str;
|
||||
xgboost::Json::Dump(arr_interface, &arr_interface_str);
|
||||
|
||||
@@ -5,6 +5,7 @@ import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
from test_dmatrix import set_base_margin_info
|
||||
|
||||
|
||||
def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
|
||||
@@ -142,6 +143,8 @@ def _test_cudf_metainfo(DMatrixT):
|
||||
dmat_cudf.get_float_info('base_margin'))
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
|
||||
|
||||
set_base_margin_info(df, DMatrixT, "gpu_hist")
|
||||
|
||||
|
||||
class TestFromColumnar:
|
||||
'''Tests for constructing DMatrix from data structure conforming Apache
|
||||
|
||||
@@ -5,6 +5,7 @@ import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
from test_dmatrix import set_base_margin_info
|
||||
|
||||
|
||||
def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
|
||||
@@ -107,6 +108,8 @@ def _test_cupy_metainfo(DMatrixT):
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'),
|
||||
dmat_cupy.get_uint_info('group_ptr'))
|
||||
|
||||
set_base_margin_info(cp.asarray, DMatrixT, "gpu_hist")
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
|
||||
@@ -22,6 +22,7 @@ from test_with_dask import run_empty_dmatrix_reg # noqa
|
||||
from test_with_dask import run_empty_dmatrix_auc # noqa
|
||||
from test_with_dask import run_auc # noqa
|
||||
from test_with_dask import run_boost_from_prediction # noqa
|
||||
from test_with_dask import run_boost_from_prediction_multi_clasas # noqa
|
||||
from test_with_dask import run_dask_classifier # noqa
|
||||
from test_with_dask import run_empty_dmatrix_cls # noqa
|
||||
from test_with_dask import _get_client_workers # noqa
|
||||
@@ -297,13 +298,18 @@ def run_gpu_hist(
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
|
||||
import cudf
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.datasets import load_breast_cancer, load_digits
|
||||
with Client(local_cuda_cluster) as client:
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
run_boost_from_prediction(X, y, "gpu_hist", client)
|
||||
|
||||
X_, y_ = load_digits(return_X_y=True)
|
||||
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client)
|
||||
|
||||
|
||||
class TestDistributedGPU:
|
||||
@pytest.mark.skipif(**tm.no_dask())
|
||||
|
||||
@@ -35,8 +35,25 @@ def test_gpu_binary_classification():
|
||||
assert err < 0.1
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_boost_from_prediction_gpu_hist():
|
||||
twskl.run_boost_from_prediction('gpu_hist')
|
||||
from sklearn.datasets import load_breast_cancer, load_digits
|
||||
import cupy as cp
|
||||
import cudf
|
||||
|
||||
tree_method = "gpu_hist"
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
X, y = cp.array(X), cp.array(y)
|
||||
|
||||
twskl.run_boost_from_prediction_binary(tree_method, X, y, None)
|
||||
twskl.run_boost_from_prediction_binary(tree_method, X, y, cudf.DataFrame)
|
||||
|
||||
X, y = load_digits(return_X_y=True)
|
||||
X, y = cp.array(X), cp.array(y)
|
||||
|
||||
twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
|
||||
twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame)
|
||||
|
||||
|
||||
def test_num_parallel_tree():
|
||||
|
||||
@@ -15,6 +15,24 @@ dpath = 'demo/data/'
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
def set_base_margin_info(DType, DMatrixT, tm: str):
|
||||
rng = np.random.default_rng()
|
||||
X = DType(rng.normal(0, 1.0, size=100).reshape(50, 2))
|
||||
if hasattr(X, "iloc"):
|
||||
y = X.iloc[:, 0]
|
||||
else:
|
||||
y = X[:, 0]
|
||||
base_margin = X
|
||||
# no error at set
|
||||
Xy = DMatrixT(X, y, base_margin=base_margin)
|
||||
# Error at train, caused by check in predictor.
|
||||
with pytest.raises(ValueError, match=r".*base_margin.*"):
|
||||
xgb.train({"tree_method": tm}, Xy)
|
||||
|
||||
# FIXME(jiamingy): Currently the metainfo has no concept of shape. If you pass a
|
||||
# base_margin with shape (n_classes, n_samples) to XGBoost the result is undefined.
|
||||
|
||||
|
||||
class TestDMatrix:
|
||||
def test_warn_missing(self):
|
||||
from xgboost import data
|
||||
@@ -122,7 +140,7 @@ class TestDMatrix:
|
||||
|
||||
# base margin is per-class in multi-class classifier
|
||||
base_margin = rng.randn(100, 3).astype(np.float32)
|
||||
d.set_base_margin(base_margin.flatten())
|
||||
d.set_base_margin(base_margin)
|
||||
|
||||
ridxs = [1, 2, 3, 4, 5, 6]
|
||||
sliced = d.slice(ridxs)
|
||||
@@ -380,3 +398,6 @@ class TestDMatrix:
|
||||
feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
|
||||
Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
|
||||
np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
|
||||
|
||||
def test_base_margin(self):
|
||||
set_base_margin_info(np.asarray, xgb.DMatrix, "hist")
|
||||
|
||||
@@ -7,7 +7,7 @@ import sys
|
||||
import numpy as np
|
||||
import scipy
|
||||
import json
|
||||
from typing import List, Tuple, Dict, Optional, Type, Any
|
||||
from typing import List, Tuple, Dict, Optional, Type, Any, Callable
|
||||
import asyncio
|
||||
from functools import partial
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
@@ -182,6 +182,50 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
|
||||
assert prediction.shape[1] == 3
|
||||
|
||||
|
||||
def run_boost_from_prediction_multi_clasas(
|
||||
X: xgb.dask._DaskCollection,
|
||||
y: xgb.dask._DaskCollection,
|
||||
tree_method: str,
|
||||
client: "Client"
|
||||
) -> None:
|
||||
model_0 = xgb.dask.DaskXGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
|
||||
)
|
||||
model_0.fit(X=X, y=y)
|
||||
margin = xgb.dask.inplace_predict(
|
||||
client, model_0.get_booster(), X, predict_type="margin"
|
||||
)
|
||||
|
||||
model_1 = xgb.dask.DaskXGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
|
||||
)
|
||||
model_1.fit(X=X, y=y, base_margin=margin)
|
||||
predictions_1 = xgb.dask.predict(
|
||||
client,
|
||||
model_1.get_booster(),
|
||||
xgb.dask.DaskDMatrix(client, X, base_margin=margin),
|
||||
output_margin=True
|
||||
)
|
||||
|
||||
model_2 = xgb.dask.DaskXGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
|
||||
)
|
||||
model_2.fit(X=X, y=y)
|
||||
predictions_2 = xgb.dask.inplace_predict(
|
||||
client, model_2.get_booster(), X, predict_type="margin"
|
||||
)
|
||||
a = predictions_1.compute()
|
||||
b = predictions_2.compute()
|
||||
# cupy/cudf
|
||||
if hasattr(a, "get"):
|
||||
a = a.get()
|
||||
if hasattr(b, "values"):
|
||||
b = b.values
|
||||
if hasattr(b, "get"):
|
||||
b = b.get()
|
||||
np.testing.assert_allclose(a, b, atol=1e-5)
|
||||
|
||||
|
||||
def run_boost_from_prediction(
|
||||
X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
|
||||
) -> None:
|
||||
@@ -227,11 +271,15 @@ def run_boost_from_prediction(
|
||||
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
|
||||
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.datasets import load_breast_cancer, load_digits
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
|
||||
run_boost_from_prediction(X, y, tree_method, client)
|
||||
|
||||
X_, y_ = load_digits(return_X_y=True)
|
||||
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
|
||||
run_boost_from_prediction_multi_clasas(X, y, tree_method, client)
|
||||
|
||||
|
||||
def test_inplace_predict(client: "Client") -> None:
|
||||
from sklearn.datasets import load_boston
|
||||
|
||||
@@ -3,6 +3,7 @@ import numpy as np
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import pytest
|
||||
from test_dmatrix import set_base_margin_info
|
||||
|
||||
try:
|
||||
import modin.pandas as md
|
||||
@@ -144,3 +145,6 @@ class TestModin:
|
||||
assert data.num_col() == kCols
|
||||
|
||||
np.testing.assert_array_equal(data.get_weight(), w)
|
||||
|
||||
def test_base_margin(self):
|
||||
set_base_margin_info(md.DataFrame, xgb.DMatrix, "hist")
|
||||
|
||||
@@ -3,6 +3,7 @@ import numpy as np
|
||||
import xgboost as xgb
|
||||
import testing as tm
|
||||
import pytest
|
||||
from test_dmatrix import set_base_margin_info
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
@@ -205,6 +206,9 @@ class TestPandas:
|
||||
|
||||
np.testing.assert_array_equal(data.get_weight(), w)
|
||||
|
||||
def test_base_margin(self):
|
||||
set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
|
||||
|
||||
def test_cv_as_pandas(self):
|
||||
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
|
||||
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from typing import Callable, Optional
|
||||
import collections
|
||||
import importlib.util
|
||||
import numpy as np
|
||||
@@ -1147,32 +1148,83 @@ def test_feature_weights():
|
||||
assert poly_decreasing[0] < -0.08
|
||||
|
||||
|
||||
def run_boost_from_prediction(tree_method):
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Callable]):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
|
||||
as_frame: A callable that converts the margin into a DataFrame, useful for different
|
||||
df implementations.
|
||||
"""
|
||||
|
||||
model_0 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4,
|
||||
tree_method=tree_method)
|
||||
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
|
||||
)
|
||||
model_0.fit(X=X, y=y)
|
||||
margin = model_0.predict(X, output_margin=True)
|
||||
if as_frame is not None:
|
||||
margin = as_frame(margin)
|
||||
|
||||
model_1 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4,
|
||||
tree_method=tree_method)
|
||||
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
|
||||
)
|
||||
model_1.fit(X=X, y=y, base_margin=margin)
|
||||
predictions_1 = model_1.predict(X, base_margin=margin)
|
||||
|
||||
cls_2 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=8,
|
||||
tree_method=tree_method)
|
||||
learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
|
||||
)
|
||||
cls_2.fit(X=X, y=y)
|
||||
predictions_2 = cls_2.predict(X)
|
||||
assert np.all(predictions_1 == predictions_2)
|
||||
np.testing.assert_allclose(predictions_1, predictions_2)
|
||||
|
||||
|
||||
def run_boost_from_prediction_multi_clasas(
|
||||
tree_method, X, y, as_frame: Optional[Callable]
|
||||
):
|
||||
# Multi-class
|
||||
model_0 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
|
||||
)
|
||||
model_0.fit(X=X, y=y)
|
||||
margin = model_0.get_booster().inplace_predict(X, predict_type="margin")
|
||||
if as_frame is not None:
|
||||
margin = as_frame(margin)
|
||||
|
||||
model_1 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
|
||||
)
|
||||
model_1.fit(X=X, y=y, base_margin=margin)
|
||||
predictions_1 = model_1.get_booster().predict(
|
||||
xgb.DMatrix(X, base_margin=margin), output_margin=True
|
||||
)
|
||||
|
||||
model_2 = xgb.XGBClassifier(
|
||||
learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
|
||||
)
|
||||
model_2.fit(X=X, y=y)
|
||||
predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin")
|
||||
|
||||
if hasattr(predictions_1, "get"):
|
||||
predictions_1 = predictions_1.get()
|
||||
if hasattr(predictions_2, "get"):
|
||||
predictions_2 = predictions_2.get()
|
||||
np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
|
||||
def test_boost_from_prediction(tree_method):
|
||||
run_boost_from_prediction(tree_method)
|
||||
from sklearn.datasets import load_breast_cancer, load_digits
|
||||
import pandas as pd
|
||||
X, y = load_breast_cancer(return_X_y=True)
|
||||
|
||||
run_boost_from_prediction_binary(tree_method, X, y, None)
|
||||
run_boost_from_prediction_binary(tree_method, X, y, pd.DataFrame)
|
||||
|
||||
X, y = load_digits(return_X_y=True)
|
||||
|
||||
run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
|
||||
run_boost_from_prediction_multi_clasas(tree_method, X, y, pd.DataFrame)
|
||||
|
||||
|
||||
def test_estimator_type():
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import urllib
|
||||
import zipfile
|
||||
import sys
|
||||
from typing import Optional
|
||||
from contextlib import contextmanager
|
||||
from io import StringIO
|
||||
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
|
||||
@@ -177,7 +178,7 @@ class TestDataset:
|
||||
self.metric = metric
|
||||
self.X, self.y = get_dataset()
|
||||
self.w = None
|
||||
self.margin = None
|
||||
self.margin: Optional[np.ndarray] = None
|
||||
|
||||
def set_params(self, params_in):
|
||||
params_in['objective'] = self.objective
|
||||
@@ -315,7 +316,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
|
||||
|
||||
@strategies.composite
|
||||
def _dataset_weight_margin(draw):
|
||||
data = draw(_unweighted_datasets_strategy)
|
||||
data: TestDataset = draw(_unweighted_datasets_strategy)
|
||||
if draw(strategies.booleans()):
|
||||
data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
|
||||
if draw(strategies.booleans()):
|
||||
@@ -324,6 +325,8 @@ def _dataset_weight_margin(draw):
|
||||
num_class = int(np.max(data.y) + 1)
|
||||
data.margin = draw(
|
||||
arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0)))
|
||||
if num_class != 1:
|
||||
data.margin = data.margin.reshape(data.y.shape[0], num_class)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
Reference in New Issue
Block a user