Support multi-class with base margin. (#7381)

This is already partially supported but was never properly tested. As a result, the only way to use it was to call `numpy.ndarray.flatten` on `base_margin` before passing it into XGBoost. This PR adds proper support
for most of the data types, along with tests.
This commit is contained in:
Jiaming Yuan
2021-11-02 13:38:00 +08:00
committed by GitHub
parent 6295dc3b67
commit a13321148a
18 changed files with 274 additions and 92 deletions

View File

@@ -7,7 +7,7 @@ import sys
import numpy as np
import scipy
import json
from typing import List, Tuple, Dict, Optional, Type, Any
from typing import List, Tuple, Dict, Optional, Type, Any, Callable
import asyncio
from functools import partial
from concurrent.futures import ThreadPoolExecutor
@@ -182,6 +182,50 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
assert prediction.shape[1] == 3
def run_boost_from_prediction_multi_clasas(
    X: xgb.dask._DaskCollection,
    y: xgb.dask._DaskCollection,
    tree_method: str,
    client: "Client"
) -> None:
    """Check multi-class training continued from a base margin.

    Trains a 4-round model, feeds its margin back in as ``base_margin`` for
    another 4 rounds, and verifies the result matches a single 8-round model
    trained from scratch.

    Parameters
    ----------
    X, y :
        Dask collections holding a multi-class dataset.
    tree_method :
        Tree method to test, e.g. "hist" or "approx".
    client :
        Dask client driving the computation.
    """
    model_0 = xgb.dask.DaskXGBClassifier(
        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
    )
    model_0.fit(X=X, y=y)
    margin = xgb.dask.inplace_predict(
        client, model_0.get_booster(), X, predict_type="margin"
    )

    # Continue boosting from the first model's margin for 4 more rounds.
    model_1 = xgb.dask.DaskXGBClassifier(
        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
    )
    model_1.fit(X=X, y=y, base_margin=margin)
    predictions_1 = xgb.dask.predict(
        client,
        model_1.get_booster(),
        xgb.dask.DaskDMatrix(client, X, base_margin=margin),
        output_margin=True
    )

    # Reference: 8 rounds trained in one go.
    model_2 = xgb.dask.DaskXGBClassifier(
        learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
    )
    model_2.fit(X=X, y=y)
    predictions_2 = xgb.dask.inplace_predict(
        client, model_2.get_booster(), X, predict_type="margin"
    )

    def to_numpy(arr):
        # Normalize device results to host numpy: cudf exposes ``.values``
        # (a cupy array) and cupy exposes ``.get()``.  The original code
        # applied the cudf conversion only to one side of the comparison.
        if hasattr(arr, "values"):
            arr = arr.values
        if hasattr(arr, "get"):
            arr = arr.get()
        return arr

    a = to_numpy(predictions_1.compute())
    b = to_numpy(predictions_2.compute())
    np.testing.assert_allclose(a, b, atol=1e-5)
def run_boost_from_prediction(
X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
) -> None:
@@ -227,11 +271,15 @@ def run_boost_from_prediction(
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
    """Boost from an initial margin for binary and multi-class data."""
    # Single deduplicated import (the file previously carried both the old
    # and the new import line).
    from sklearn.datasets import load_breast_cancer, load_digits

    # Binary classification.
    X_, y_ = load_breast_cancer(return_X_y=True)
    X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
    run_boost_from_prediction(X, y, tree_method, client)

    # Multi-class classification.
    X_, y_ = load_digits(return_X_y=True)
    X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
    run_boost_from_prediction_multi_clasas(X, y, tree_method, client)
def test_inplace_predict(client: "Client") -> None:
from sklearn.datasets import load_boston