Support multi-class with base margin. (#7381)

Multi-class `base_margin` was already partially supported but never properly tested, so the only way to use it was to call `numpy.ndarray.flatten` on `base_margin` before passing it into XGBoost. This PR adds proper support
for most of the data types, along with tests.
This commit is contained in:
Jiaming Yuan
2021-11-02 13:38:00 +08:00
committed by GitHub
parent 6295dc3b67
commit a13321148a
18 changed files with 274 additions and 92 deletions

View File

@@ -252,6 +252,8 @@ TEST(MetaInfo, Validate) {
EXPECT_THROW(info.Validate(1), dmlc::Error);
xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
d_groups.SetDevice(0);
d_groups.DevicePointer(); // pull to device
auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1);
std::string arr_interface_str;
xgboost::Json::Dump(arr_interface, &arr_interface_str);

View File

@@ -5,6 +5,7 @@ import pytest
sys.path.append("tests/python")
import testing as tm
from test_dmatrix import set_base_margin_info
def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
@@ -142,6 +143,8 @@ def _test_cudf_metainfo(DMatrixT):
dmat_cudf.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
set_base_margin_info(df, DMatrixT, "gpu_hist")
class TestFromColumnar:
'''Tests for constructing DMatrix from data structure conforming Apache

View File

@@ -5,6 +5,7 @@ import pytest
sys.path.append("tests/python")
import testing as tm
from test_dmatrix import set_base_margin_info
def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
@@ -107,6 +108,8 @@ def _test_cupy_metainfo(DMatrixT):
assert np.array_equal(dmat.get_uint_info('group_ptr'),
dmat_cupy.get_uint_info('group_ptr'))
set_base_margin_info(cp.asarray, DMatrixT, "gpu_hist")
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_sklearn())

View File

@@ -22,6 +22,7 @@ from test_with_dask import run_empty_dmatrix_reg # noqa
from test_with_dask import run_empty_dmatrix_auc # noqa
from test_with_dask import run_auc # noqa
from test_with_dask import run_boost_from_prediction # noqa
from test_with_dask import run_boost_from_prediction_multi_clasas # noqa
from test_with_dask import run_dask_classifier # noqa
from test_with_dask import run_empty_dmatrix_cls # noqa
from test_with_dask import _get_client_workers # noqa
@@ -297,13 +298,18 @@ def run_gpu_hist(
@pytest.mark.skipif(**tm.no_cudf())
def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
import cudf
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_breast_cancer, load_digits
with Client(local_cuda_cluster) as client:
X_, y_ = load_breast_cancer(return_X_y=True)
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
run_boost_from_prediction(X, y, "gpu_hist", client)
X_, y_ = load_digits(return_X_y=True)
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client)
class TestDistributedGPU:
@pytest.mark.skipif(**tm.no_dask())

View File

@@ -35,8 +35,25 @@ def test_gpu_binary_classification():
assert err < 0.1
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_cudf())
def test_boost_from_prediction_gpu_hist():
twskl.run_boost_from_prediction('gpu_hist')
from sklearn.datasets import load_breast_cancer, load_digits
import cupy as cp
import cudf
tree_method = "gpu_hist"
X, y = load_breast_cancer(return_X_y=True)
X, y = cp.array(X), cp.array(y)
twskl.run_boost_from_prediction_binary(tree_method, X, y, None)
twskl.run_boost_from_prediction_binary(tree_method, X, y, cudf.DataFrame)
X, y = load_digits(return_X_y=True)
X, y = cp.array(X), cp.array(y)
twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame)
def test_num_parallel_tree():

View File

@@ -15,6 +15,24 @@ dpath = 'demo/data/'
rng = np.random.RandomState(1994)
def set_base_margin_info(DType, DMatrixT, tm: str):
    """Check that a 2-column ``base_margin`` is accepted at construction but
    rejected once training starts.

    Parameters
    ----------
    DType: Callable that converts a ``(50, 2)`` numpy array into the container
        type under test.
    DMatrixT: Constructor called as ``DMatrixT(X, y, base_margin=...)``.
    tm: Value forwarded as the ``tree_method`` training parameter.
    """
    generator = np.random.default_rng()
    samples = generator.normal(0, 1.0, size=100)
    X = DType(samples.reshape(50, 2))
    # Containers exposing ``iloc`` are indexed positionally through it; anything
    # else is sliced directly.
    y = X.iloc[:, 0] if hasattr(X, "iloc") else X[:, 0]
    # The feature matrix itself doubles as the margin.  No error at set.
    Xy = DMatrixT(X, y, base_margin=X)
    # Error at train, caused by check in predictor.
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        xgb.train({"tree_method": tm}, Xy)
    # FIXME(jiamingy): Currently the metainfo has no concept of shape. If you pass a
    # base_margin with shape (n_classes, n_samples) to XGBoost the result is undefined.
class TestDMatrix:
def test_warn_missing(self):
from xgboost import data
@@ -122,7 +140,7 @@ class TestDMatrix:
# base margin is per-class in multi-class classifier
base_margin = rng.randn(100, 3).astype(np.float32)
d.set_base_margin(base_margin.flatten())
d.set_base_margin(base_margin)
ridxs = [1, 2, 3, 4, 5, 6]
sliced = d.slice(ridxs)
@@ -380,3 +398,6 @@ class TestDMatrix:
feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
def test_base_margin(self):
set_base_margin_info(np.asarray, xgb.DMatrix, "hist")

View File

@@ -7,7 +7,7 @@ import sys
import numpy as np
import scipy
import json
from typing import List, Tuple, Dict, Optional, Type, Any
from typing import List, Tuple, Dict, Optional, Type, Any, Callable
import asyncio
from functools import partial
from concurrent.futures import ThreadPoolExecutor
@@ -182,6 +182,50 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
assert prediction.shape[1] == 3
def run_boost_from_prediction_multi_clasas(
    X: xgb.dask._DaskCollection,
    y: xgb.dask._DaskCollection,
    tree_method: str,
    client: "Client"
) -> None:
    """Compare 4 + 4 boosting rounds chained through ``base_margin`` against a
    single 8-round model, using the dask interface.

    Parameters
    ----------
    X: Feature collection used for both fitting and prediction.
    y: Label collection.
    tree_method: Forwarded to every ``DaskXGBClassifier`` built here.
    client: Dask client used for the prediction calls.
    """
    def make_classifier(n_estimators):
        # All three models share every hyper-parameter except the round count.
        return xgb.dask.DaskXGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=n_estimators,
            tree_method=tree_method,
        )

    # Stage one: 4 rounds from scratch, then take its raw margin output.
    first_stage = make_classifier(4)
    first_stage.fit(X=X, y=y)
    margin = xgb.dask.inplace_predict(
        client, first_stage.get_booster(), X, predict_type="margin"
    )

    # Stage two: 4 more rounds started from the stage-one margin.
    second_stage = make_classifier(4)
    second_stage.fit(X=X, y=y, base_margin=margin)
    margin_dmatrix = xgb.dask.DaskDMatrix(client, X, base_margin=margin)
    predictions_1 = xgb.dask.predict(
        client, second_stage.get_booster(), margin_dmatrix, output_margin=True
    )

    # Reference: 8 rounds in one go.
    reference = make_classifier(8)
    reference.fit(X=X, y=y)
    predictions_2 = xgb.dask.inplace_predict(
        client, reference.get_booster(), X, predict_type="margin"
    )

    chained = predictions_1.compute()
    single = predictions_2.compute()
    # cupy/cudf
    if hasattr(chained, "get"):
        chained = chained.get()
    if hasattr(single, "values"):
        single = single.values
    if hasattr(single, "get"):
        single = single.get()
    np.testing.assert_allclose(chained, single, atol=1e-5)
def run_boost_from_prediction(
X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
) -> None:
@@ -227,11 +271,15 @@ def run_boost_from_prediction(
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_breast_cancer, load_digits
X_, y_ = load_breast_cancer(return_X_y=True)
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
run_boost_from_prediction(X, y, tree_method, client)
X_, y_ = load_digits(return_X_y=True)
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
run_boost_from_prediction_multi_clasas(X, y, tree_method, client)
def test_inplace_predict(client: "Client") -> None:
from sklearn.datasets import load_boston

View File

@@ -3,6 +3,7 @@ import numpy as np
import xgboost as xgb
import testing as tm
import pytest
from test_dmatrix import set_base_margin_info
try:
import modin.pandas as md
@@ -144,3 +145,6 @@ class TestModin:
assert data.num_col() == kCols
np.testing.assert_array_equal(data.get_weight(), w)
def test_base_margin(self):
set_base_margin_info(md.DataFrame, xgb.DMatrix, "hist")

View File

@@ -3,6 +3,7 @@ import numpy as np
import xgboost as xgb
import testing as tm
import pytest
from test_dmatrix import set_base_margin_info
try:
import pandas as pd
@@ -205,6 +206,9 @@ class TestPandas:
np.testing.assert_array_equal(data.get_weight(), w)
def test_base_margin(self):
set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
def test_cv_as_pandas(self):
dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,

View File

@@ -1,3 +1,4 @@
from typing import Callable, Optional
import collections
import importlib.util
import numpy as np
@@ -1147,32 +1148,83 @@ def test_feature_weights():
assert poly_decreasing[0] < -0.08
def run_boost_from_prediction(tree_method):
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Callable]):
"""
Parameters
----------
as_frame: A callable function to convert margin into DataFrame, useful for different
df implementations.
"""
model_0 = xgb.XGBClassifier(
learning_rate=0.3, random_state=0, n_estimators=4,
tree_method=tree_method)
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
)
model_0.fit(X=X, y=y)
margin = model_0.predict(X, output_margin=True)
if as_frame is not None:
margin = as_frame(margin)
model_1 = xgb.XGBClassifier(
learning_rate=0.3, random_state=0, n_estimators=4,
tree_method=tree_method)
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
)
model_1.fit(X=X, y=y, base_margin=margin)
predictions_1 = model_1.predict(X, base_margin=margin)
cls_2 = xgb.XGBClassifier(
learning_rate=0.3, random_state=0, n_estimators=8,
tree_method=tree_method)
learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
)
cls_2.fit(X=X, y=y)
predictions_2 = cls_2.predict(X)
assert np.all(predictions_1 == predictions_2)
np.testing.assert_allclose(predictions_1, predictions_2)
def run_boost_from_prediction_multi_clasas(
    tree_method, X, y, as_frame: Optional[Callable]
):
    """Compare 4 + 4 boosting rounds chained through ``base_margin`` against a
    single 8-round model, using the scikit-learn interface.

    Parameters
    ----------
    tree_method: Forwarded to every ``XGBClassifier`` built here.
    X: Feature matrix used for both fitting and prediction.
    y: Labels.
    as_frame: A callable function to convert margin into DataFrame, useful for
        different df implementations.  ``None`` leaves the margin untouched.
    """
    def new_model(rounds):
        # All three models share every hyper-parameter except the round count.
        return xgb.XGBClassifier(
            learning_rate=0.3,
            random_state=0,
            n_estimators=rounds,
            tree_method=tree_method,
        )

    # Multi-class
    first_stage = new_model(4)
    first_stage.fit(X=X, y=y)
    margin = first_stage.get_booster().inplace_predict(X, predict_type="margin")
    if as_frame is not None:
        margin = as_frame(margin)

    # Continue for 4 more rounds on top of the stage-one margin.
    second_stage = new_model(4)
    second_stage.fit(X=X, y=y, base_margin=margin)
    margin_dmatrix = xgb.DMatrix(X, base_margin=margin)
    predictions_1 = second_stage.get_booster().predict(
        margin_dmatrix, output_margin=True
    )

    # Reference: 8 rounds in one go.
    reference = new_model(8)
    reference.fit(X=X, y=y)
    predictions_2 = reference.get_booster().inplace_predict(X, predict_type="margin")

    # Results exposing ``get`` are converted through it before the comparison.
    if hasattr(predictions_1, "get"):
        predictions_1 = predictions_1.get()
    if hasattr(predictions_2, "get"):
        predictions_2 = predictions_2.get()
    np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6)
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def test_boost_from_prediction(tree_method):
run_boost_from_prediction(tree_method)
from sklearn.datasets import load_breast_cancer, load_digits
import pandas as pd
X, y = load_breast_cancer(return_X_y=True)
run_boost_from_prediction_binary(tree_method, X, y, None)
run_boost_from_prediction_binary(tree_method, X, y, pd.DataFrame)
X, y = load_digits(return_X_y=True)
run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
run_boost_from_prediction_multi_clasas(tree_method, X, y, pd.DataFrame)
def test_estimator_type():

View File

@@ -3,6 +3,7 @@ import os
import urllib
import zipfile
import sys
from typing import Optional
from contextlib import contextmanager
from io import StringIO
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
@@ -177,7 +178,7 @@ class TestDataset:
self.metric = metric
self.X, self.y = get_dataset()
self.w = None
self.margin = None
self.margin: Optional[np.ndarray] = None
def set_params(self, params_in):
params_in['objective'] = self.objective
@@ -315,7 +316,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
@strategies.composite
def _dataset_weight_margin(draw):
data = draw(_unweighted_datasets_strategy)
data: TestDataset = draw(_unweighted_datasets_strategy)
if draw(strategies.booleans()):
data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
if draw(strategies.booleans()):
@@ -324,6 +325,8 @@ def _dataset_weight_margin(draw):
num_class = int(np.max(data.y) + 1)
data.margin = draw(
arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0)))
if num_class != 1:
data.margin = data.margin.reshape(data.y.shape[0], num_class)
return data