Initial support for multi-label classification. (#7521)
* Add support in sklearn classifier.
commit 8f0a42a266
parent 68cdbc9c16
@@ -27,3 +27,4 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
   external_memory
   custom_metric_obj
   categorical
+  multioutput
doc/tutorials/multioutput.rst (new file, 37 lines)
@@ -0,0 +1,37 @@
+################
+Multiple Outputs
+################
+
+.. versionadded:: 1.6
+
+Starting from version 1.6, XGBoost has experimental support for multi-output regression
+and multi-label classification with the Python package. Multi-label classification usually
+refers to targets that have multiple non-exclusive class labels. For instance, a movie
+can be simultaneously classified as both sci-fi and comedy. For a detailed explanation of
+the terminology around different multi-output models, please refer to the `scikit-learn
+user guide <https://scikit-learn.org/stable/modules/multiclass.html>`_.
+
+Internally, XGBoost builds one model for each target, similar to sklearn meta estimators,
+with the added benefit of reusing data and supporting custom objectives. For a worked
+example of regression, see :ref:`sphx_glr_python_examples_multioutput_regression.py`. For
+multi-label classification, the binary relevance strategy is used. Input ``y`` should be
+of shape ``(n_samples, n_classes)``, with each column holding a value of 0 or 1 that
+specifies whether the sample is labeled as positive for the respective class. Given a
+sample with 3 output classes and 2 labels, the corresponding ``y`` should be encoded as
+``[1, 0, 1]``, with the second class labeled as negative and the rest labeled as positive.
+At the moment, XGBoost supports only dense matrices for labels.
+
+.. code-block:: python
+
+    from sklearn.datasets import make_multilabel_classification
+    import numpy as np
+    import xgboost as xgb
+
+    X, y = make_multilabel_classification(
+        n_samples=32, n_classes=5, n_labels=3, random_state=0
+    )
+    clf = xgb.XGBClassifier(tree_method="hist")
+    clf.fit(X, y)
+    np.testing.assert_allclose(clf.predict(X), y)
+
+
+The feature is still under development, with limited support from objectives and metrics.
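The tutorial links to a full regression example; as a quick hedged sketch of that case (assuming, per the text above, that ``XGBRegressor`` in 1.6 accepts a 2-D ``y`` of shape ``(n_samples, n_targets)``; the data here is made up for illustration):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    # Synthetic multi-output regression data: 2 targets per sample.
    rng = np.random.RandomState(0)
    X = rng.randn(64, 10)
    y = np.stack([X[:, 0] + X[:, 1], 2.0 * X[:, 2]], axis=1)  # shape (64, 2)

    reg = xgb.XGBRegressor(tree_method="hist")
    reg.fit(X, y)
    # One model is built per target internally, so predictions keep the target axis.
    assert reg.predict(X).shape == (64, 2)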
@@ -1215,6 +1215,14 @@ PredtT = TypeVar("PredtT", bound=np.ndarray)
 def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> PredtT:
     assert len(prediction.shape) <= 2
     if len(prediction.shape) == 2 and prediction.shape[1] == n_classes:
+        # multi-class
         return prediction
+    if (
+        len(prediction.shape) == 2
+        and n_classes == 2
+        and prediction.shape[1] >= n_classes
+    ):
+        # multi-label
+        return prediction
     # binary logistic function
     classone_probs = prediction
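For the multi-label path added above, ``predict_proba`` returns the prediction matrix unchanged, i.e. one positive-class probability per label. A small check built on the tutorial's data (a sketch restating the diff's behavior, not an authoritative test):

.. code-block:: python

    from sklearn.datasets import make_multilabel_classification
    import numpy as np
    import xgboost as xgb

    X, y = make_multilabel_classification(
        n_samples=32, n_classes=5, n_labels=3, random_state=0
    )
    clf = xgb.XGBClassifier(tree_method="hist")
    clf.fit(X, y)

    # Binary relevance: each column is an independent binary task, so the
    # probability matrix has the same (n_samples, n_classes) shape as y.
    proba = clf.predict_proba(X)
    assert proba.shape == y.shape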
@@ -1374,9 +1382,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             # If output_margin is active, simply return the scores
             return class_probs
 
-        if len(class_probs.shape) > 1:
-            # turns softprob into softmax
+        if len(class_probs.shape) > 1 and self.n_classes_ != 2:
+            # multi-class, turns softprob into softmax
             column_indexes: np.ndarray = np.argmax(class_probs, axis=1)  # type: ignore
+        elif len(class_probs.shape) > 1 and class_probs.shape[1] != 1:
+            # multi-label
+            column_indexes = np.zeros(class_probs.shape)
+            column_indexes[class_probs > 0.5] = 1
         else:
             # turns soft logit into class label
             column_indexes = np.repeat(0, class_probs.shape[0])
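The new ``elif`` branch converts per-label probabilities to hard labels by thresholding each column independently at 0.5. Standalone, the same operation looks like this (array values are illustrative):

.. code-block:: python

    import numpy as np

    # Per-label probabilities for 2 samples and 3 labels (made-up values).
    class_probs = np.array([[0.9, 0.2, 0.7],
                            [0.1, 0.6, 0.4]])

    # Mirrors the multi-label branch: independent 0.5 threshold per column.
    column_indexes = np.zeros(class_probs.shape)
    column_indexes[class_probs > 0.5] = 1
    print(column_indexes)  # [[1. 0. 1.]
                           #  [0. 1. 0.]]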
@@ -1194,6 +1194,24 @@ def test_estimator_type():
     cls.load_model(path)  # no error
 
 
+def test_multilabel_classification() -> None:
+    from sklearn.datasets import make_multilabel_classification
+
+    X, y = make_multilabel_classification(
+        n_samples=32, n_classes=5, n_labels=3, random_state=0
+    )
+    clf = xgb.XGBClassifier(tree_method="hist")
+    clf.fit(X, y)
+    booster = clf.get_booster()
+    learner = json.loads(booster.save_config())["learner"]
+    assert int(learner["learner_model_param"]["num_target"]) == 5
+
+    np.testing.assert_allclose(clf.predict(X), y)
+    predt = (clf.predict_proba(X) > 0.5).astype(np.int64)
+    np.testing.assert_allclose(clf.predict(X), predt)
+    assert predt.dtype == np.int64
+
+
 def run_data_initialization(DMatrix, model, X, y):
     """Assert that we don't create duplicated DMatrix."""