Support multi-class with base margin. (#7381)

Multi-class `base_margin` was already partially supported but never properly tested, so the only way to use it was to call `numpy.ndarray.flatten` on `base_margin` before passing it to XGBoost. This PR adds proper support for most of the data types, along with tests.
2021-11-02 13:38:00 +08:00
parent 6295dc3b67
commit a13321148a
18 changed files with 274 additions and 92 deletions
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -252,6 +252,8 @@ TEST(MetaInfo, Validate) {
  EXPECT_THROW(info.Validate(1), dmlc::Error);

  xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
+  d_groups.SetDevice(0);
+  d_groups.DevicePointer();  // pull to device
  auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1);
  std::string arr_interface_str;
  xgboost::Json::Dump(arr_interface, &arr_interface_str);
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -5,6 +5,7 @@ import pytest

 sys.path.append("tests/python")
 import testing as tm
+from test_dmatrix import set_base_margin_info


 def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
@@ -142,6 +143,8 @@ def _test_cudf_metainfo(DMatrixT):
                          dmat_cudf.get_float_info('base_margin'))
    assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))

+    set_base_margin_info(df, DMatrixT, "gpu_hist")
+

 class TestFromColumnar:
    '''Tests for constructing DMatrix from data structure conforming Apache
--- a/tests/python-gpu/test_from_cupy.py
+++ b/tests/python-gpu/test_from_cupy.py
@@ -5,6 +5,7 @@ import pytest

 sys.path.append("tests/python")
 import testing as tm
+from test_dmatrix import set_base_margin_info


 def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
@@ -107,6 +108,8 @@ def _test_cupy_metainfo(DMatrixT):
    assert np.array_equal(dmat.get_uint_info('group_ptr'),
                          dmat_cupy.get_uint_info('group_ptr'))

+    set_base_margin_info(cp.asarray, DMatrixT, "gpu_hist")
+

@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_sklearn())
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -22,6 +22,7 @@ from test_with_dask import run_empty_dmatrix_reg      # noqa
 from test_with_dask import run_empty_dmatrix_auc      # noqa
 from test_with_dask import run_auc                    # noqa
 from test_with_dask import run_boost_from_prediction  # noqa
+from test_with_dask import run_boost_from_prediction_multi_clasas  # noqa
 from test_with_dask import run_dask_classifier        # noqa
 from test_with_dask import run_empty_dmatrix_cls      # noqa
 from test_with_dask import _get_client_workers        # noqa
@@ -297,13 +298,18 @@ def run_gpu_hist(
@pytest.mark.skipif(**tm.no_cudf())
 def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
    import cudf
-    from sklearn.datasets import load_breast_cancer
+    from sklearn.datasets import load_breast_cancer, load_digits
    with Client(local_cuda_cluster) as client:
        X_, y_ = load_breast_cancer(return_X_y=True)
        X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
        y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
        run_boost_from_prediction(X, y, "gpu_hist", client)

+        X_, y_ = load_digits(return_X_y=True)
+        X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
+        y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
+        run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client)
+

 class TestDistributedGPU:
    @pytest.mark.skipif(**tm.no_dask())
--- a/tests/python-gpu/test_gpu_with_sklearn.py
+++ b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -35,8 +35,25 @@ def test_gpu_binary_classification():
            assert err < 0.1


+@pytest.mark.skipif(**tm.no_cupy())
+@pytest.mark.skipif(**tm.no_cudf())
 def test_boost_from_prediction_gpu_hist():
-    twskl.run_boost_from_prediction('gpu_hist')
+    from sklearn.datasets import load_breast_cancer, load_digits
+    import cupy as cp
+    import cudf
+
+    tree_method = "gpu_hist"
+    X, y = load_breast_cancer(return_X_y=True)
+    X, y = cp.array(X), cp.array(y)
+
+    twskl.run_boost_from_prediction_binary(tree_method, X, y, None)
+    twskl.run_boost_from_prediction_binary(tree_method, X, y, cudf.DataFrame)
+
+    X, y = load_digits(return_X_y=True)
+    X, y = cp.array(X), cp.array(y)
+
+    twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
+    twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame)


 def test_num_parallel_tree():
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -15,6 +15,24 @@ dpath = 'demo/data/'
 rng = np.random.RandomState(1994)


+def set_base_margin_info(DType, DMatrixT, tm: str):
+    rng = np.random.default_rng()
+    X = DType(rng.normal(0, 1.0, size=100).reshape(50, 2))
+    if hasattr(X, "iloc"):
+        y = X.iloc[:, 0]
+    else:
+        y = X[:, 0]
+    base_margin = X
+    # no error at set
+    Xy = DMatrixT(X, y, base_margin=base_margin)
+    # Error at train, caused by check in predictor.
+    with pytest.raises(ValueError, match=r".*base_margin.*"):
+        xgb.train({"tree_method": tm}, Xy)
+
+    # FIXME(jiamingy): Currently the metainfo has no concept of shape.  If you pass a
+    # base_margin with shape (n_classes, n_samples) to XGBoost the result is undefined.
+
+
 class TestDMatrix:
    def test_warn_missing(self):
        from xgboost import data
@@ -122,7 +140,7 @@ class TestDMatrix:

        # base margin is per-class in multi-class classifier
        base_margin = rng.randn(100, 3).astype(np.float32)
-        d.set_base_margin(base_margin.flatten())
+        d.set_base_margin(base_margin)

        ridxs = [1, 2, 3, 4, 5, 6]
        sliced = d.slice(ridxs)
@@ -380,3 +398,6 @@ class TestDMatrix:
        feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
        Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
        np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
+
+    def test_base_margin(self):
+        set_base_margin_info(np.asarray, xgb.DMatrix, "hist")
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -7,7 +7,7 @@ import sys
 import numpy as np
 import scipy
 import json
-from typing import List, Tuple, Dict, Optional, Type, Any
+from typing import List, Tuple, Dict, Optional, Type, Any, Callable
 import asyncio
 from functools import partial
 from concurrent.futures import ThreadPoolExecutor
@@ -182,6 +182,50 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
    assert prediction.shape[1] == 3


+def run_boost_from_prediction_multi_clasas(
+    X: xgb.dask._DaskCollection,
+    y: xgb.dask._DaskCollection,
+    tree_method: str,
+    client: "Client"
+) -> None:
+    model_0 = xgb.dask.DaskXGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+    )
+    model_0.fit(X=X, y=y)
+    margin = xgb.dask.inplace_predict(
+        client, model_0.get_booster(), X, predict_type="margin"
+    )
+
+    model_1 = xgb.dask.DaskXGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+    )
+    model_1.fit(X=X, y=y, base_margin=margin)
+    predictions_1 = xgb.dask.predict(
+        client,
+        model_1.get_booster(),
+        xgb.dask.DaskDMatrix(client, X, base_margin=margin),
+        output_margin=True
+    )
+
+    model_2 = xgb.dask.DaskXGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
+    )
+    model_2.fit(X=X, y=y)
+    predictions_2 = xgb.dask.inplace_predict(
+        client, model_2.get_booster(), X, predict_type="margin"
+    )
+    a = predictions_1.compute()
+    b = predictions_2.compute()
+    # cupy/cudf
+    if hasattr(a, "get"):
+        a = a.get()
+    if hasattr(b, "values"):
+        b = b.values
+    if hasattr(b, "get"):
+        b = b.get()
+    np.testing.assert_allclose(a, b, atol=1e-5)
+
+
 def run_boost_from_prediction(
    X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
 ) -> None:
@@ -227,11 +271,15 @@ def run_boost_from_prediction(

@pytest.mark.parametrize("tree_method", ["hist", "approx"])
 def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
-    from sklearn.datasets import load_breast_cancer
+    from sklearn.datasets import load_breast_cancer, load_digits
    X_, y_ = load_breast_cancer(return_X_y=True)
    X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
    run_boost_from_prediction(X, y, tree_method, client)

+    X_, y_ = load_digits(return_X_y=True)
+    X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
+    run_boost_from_prediction_multi_clasas(X, y, tree_method, client)
+

 def test_inplace_predict(client: "Client") -> None:
    from sklearn.datasets import load_boston
--- a/tests/python/test_with_modin.py
+++ b/tests/python/test_with_modin.py
@@ -3,6 +3,7 @@ import numpy as np
 import xgboost as xgb
 import testing as tm
 import pytest
+from test_dmatrix import set_base_margin_info

 try:
    import modin.pandas as md
@@ -144,3 +145,6 @@ class TestModin:
        assert data.num_col() == kCols

        np.testing.assert_array_equal(data.get_weight(), w)
+
+    def test_base_margin(self):
+        set_base_margin_info(md.DataFrame, xgb.DMatrix, "hist")
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -3,6 +3,7 @@ import numpy as np
 import xgboost as xgb
 import testing as tm
 import pytest
+from test_dmatrix import set_base_margin_info

 try:
    import pandas as pd
@@ -205,6 +206,9 @@ class TestPandas:

        np.testing.assert_array_equal(data.get_weight(), w)

+    def test_base_margin(self):
+        set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
+
    def test_cv_as_pandas(self):
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,3 +1,4 @@
+from typing import Callable, Optional
 import collections
 import importlib.util
 import numpy as np
@@ -1147,32 +1148,83 @@ def test_feature_weights():
    assert poly_decreasing[0] < -0.08


-def run_boost_from_prediction(tree_method):
-    from sklearn.datasets import load_breast_cancer
-    X, y = load_breast_cancer(return_X_y=True)
+def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Callable]):
+    """
+    Parameters
+    ----------
+
+    as_frame: Optional callable that converts the margin into a DataFrame, so the
+    test can cover different DataFrame implementations.  None keeps the margin as is.
+    """
+
    model_0 = xgb.XGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4,
-        tree_method=tree_method)
+        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+    )
    model_0.fit(X=X, y=y)
    margin = model_0.predict(X, output_margin=True)
+    if as_frame is not None:
+        margin = as_frame(margin)

    model_1 = xgb.XGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4,
-        tree_method=tree_method)
+        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = model_1.predict(X, base_margin=margin)

    cls_2 = xgb.XGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=8,
-        tree_method=tree_method)
+        learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
+    )
    cls_2.fit(X=X, y=y)
    predictions_2 = cls_2.predict(X)
-    assert np.all(predictions_1 == predictions_2)
+    np.testing.assert_allclose(predictions_1, predictions_2)
+
+
+def run_boost_from_prediction_multi_clasas(
+    tree_method, X, y, as_frame: Optional[Callable]
+):
+    # Multi-class
+    model_0 = xgb.XGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+    )
+    model_0.fit(X=X, y=y)
+    margin = model_0.get_booster().inplace_predict(X, predict_type="margin")
+    if as_frame is not None:
+        margin = as_frame(margin)
+
+    model_1 = xgb.XGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+    )
+    model_1.fit(X=X, y=y, base_margin=margin)
+    predictions_1 = model_1.get_booster().predict(
+        xgb.DMatrix(X, base_margin=margin), output_margin=True
+    )
+
+    model_2 = xgb.XGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
+    )
+    model_2.fit(X=X, y=y)
+    predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin")
+
+    if hasattr(predictions_1, "get"):
+        predictions_1 = predictions_1.get()
+    if hasattr(predictions_2, "get"):
+        predictions_2 = predictions_2.get()
+    np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6)


@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
 def test_boost_from_prediction(tree_method):
-    run_boost_from_prediction(tree_method)
+    from sklearn.datasets import load_breast_cancer, load_digits
+    import pandas as pd
+    X, y = load_breast_cancer(return_X_y=True)
+
+    run_boost_from_prediction_binary(tree_method, X, y, None)
+    run_boost_from_prediction_binary(tree_method, X, y, pd.DataFrame)
+
+    X, y = load_digits(return_X_y=True)
+
+    run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
+    run_boost_from_prediction_multi_clasas(tree_method, X, y, pd.DataFrame)


 def test_estimator_type():
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -3,6 +3,7 @@ import os
 import urllib
 import zipfile
 import sys
+from typing import Optional
 from contextlib import contextmanager
 from io import StringIO
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
@@ -177,7 +178,7 @@ class TestDataset:
        self.metric = metric
        self.X, self.y = get_dataset()
        self.w = None
-        self.margin = None
+        self.margin: Optional[np.ndarray] = None

    def set_params(self, params_in):
        params_in['objective'] = self.objective
@@ -315,7 +316,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(

@strategies.composite
 def _dataset_weight_margin(draw):
-    data = draw(_unweighted_datasets_strategy)
+    data: TestDataset = draw(_unweighted_datasets_strategy)
    if draw(strategies.booleans()):
        data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
    if draw(strategies.booleans()):
@@ -324,6 +325,8 @@ def _dataset_weight_margin(draw):
            num_class = int(np.max(data.y) + 1)
        data.margin = draw(
            arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0)))
+        if num_class != 1:
+            data.margin = data.margin.reshape(data.y.shape[0], num_class)

    return data