Initial support for multi-label classification. (#7521)
* Add support in sklearn classifier.
parent 68cdbc9c16
commit 8f0a42a266
doc/tutorials/index.rst
@@ -27,3 +27,4 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
   external_memory
   custom_metric_obj
   categorical
+  multioutput
doc/tutorials/multioutput.rst (new file, +37 lines)
@@ -0,0 +1,37 @@
+################
+Multiple Outputs
+################
+
+.. versionadded:: 1.6
+
+Starting from version 1.6, XGBoost has experimental support for multi-output regression
+and multi-label classification with the Python package. Multi-label classification
+usually refers to targets that have multiple non-exclusive class labels. For instance, a
+movie can be simultaneously classified as both sci-fi and comedy. For a detailed
+explanation of the terminology around different multi-output models, please refer to the
+`scikit-learn user guide <https://scikit-learn.org/stable/modules/multiclass.html>`_.
+
+Internally, XGBoost builds one model for each target, similar to sklearn meta estimators,
+with the added benefit of reusing data and supporting custom objectives. For a worked
+example of regression, see :ref:`sphx_glr_python_examples_multioutput_regression.py`. For
+multi-label classification, the binary relevance strategy is used: the input ``y`` should
+be of shape ``(n_samples, n_classes)``, with each column holding a value of 0 or 1 that
+specifies whether the sample is labeled as positive for the respective class. Given a
+sample with 3 output classes and 2 labels, the corresponding ``y`` should be encoded as
+``[1, 0, 1]``, with the second class labeled as negative and the rest labeled as
+positive. At the moment, XGBoost supports only dense matrices for labels.
+
+.. code-block:: python
+
+  import xgboost as xgb
+  import numpy as np
+  from sklearn.datasets import make_multilabel_classification
+
+  X, y = make_multilabel_classification(
+      n_samples=32, n_classes=5, n_labels=3, random_state=0
+  )
+  clf = xgb.XGBClassifier(tree_method="hist")
+  clf.fit(X, y)
+  np.testing.assert_allclose(clf.predict(X), y)
+
+The feature is still under development with limited support from objectives and metrics.
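The binary relevance strategy described above is conceptually equivalent to fitting one
independent binary classifier per label column. As a hedged comparison (not part of this
commit, assuming xgboost >= 1.6), the native interface can be set against scikit-learn's
``MultiOutputClassifier`` meta estimator:

.. code-block:: python

  import numpy as np
  import xgboost as xgb
  from sklearn.datasets import make_multilabel_classification
  from sklearn.multioutput import MultiOutputClassifier

  X, y = make_multilabel_classification(
      n_samples=32, n_classes=5, n_labels=3, random_state=0
  )

  # Native multi-label support: a single booster with num_target=5,
  # reusing one DMatrix across all targets.
  native = xgb.XGBClassifier(tree_method="hist").fit(X, y)

  # Meta-estimator route: five independent boosters, one per label column.
  meta = MultiOutputClassifier(xgb.XGBClassifier(tree_method="hist")).fit(X, y)

  assert native.predict(X).shape == (32, 5)
  assert np.asarray(meta.predict(X)).shape == (32, 5)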
python-package/xgboost/sklearn.py
@@ -1215,6 +1215,14 @@ PredtT = TypeVar("PredtT", bound=np.ndarray)
 def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> PredtT:
     assert len(prediction.shape) <= 2
     if len(prediction.shape) == 2 and prediction.shape[1] == n_classes:
+        # multi-class
+        return prediction
+    if (
+        len(prediction.shape) == 2
+        and n_classes == 2
+        and prediction.shape[1] >= n_classes
+    ):
+        # multi-label
         return prediction
     # binary logistic function
     classone_probs = prediction
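The helper dispatches purely on prediction shape: a 2-D array with ``n_classes`` columns
is multi-class softprob, while a 2-D array paired with ``n_classes == 2`` is the
multi-label case (one independent binary probability per label column). A hedged
standalone mirror of that dispatch, with the hypothetical ``dispatch`` helper returning a
tag instead of probabilities, for illustration only:

.. code-block:: python

  import numpy as np

  # Hypothetical copy of the branch logic in _cls_predict_proba.
  def dispatch(n_classes: int, prediction: np.ndarray) -> str:
      assert len(prediction.shape) <= 2
      if len(prediction.shape) == 2 and prediction.shape[1] == n_classes:
          return "multi-class"  # softprob, rows sum to 1
      if (
          len(prediction.shape) == 2
          and n_classes == 2
          and prediction.shape[1] >= n_classes
      ):
          return "multi-label"  # one binary probability per column
      return "binary"  # 1-D vector for the positive class

  assert dispatch(3, np.zeros((4, 3))) == "multi-class"
  assert dispatch(2, np.zeros((4, 5))) == "multi-label"
  assert dispatch(2, np.zeros(4)) == "binary"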
python-package/xgboost/sklearn.py
@@ -1374,9 +1382,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             # If output_margin is active, simply return the scores
             return class_probs
 
-        if len(class_probs.shape) > 1:
-            # turns softprob into softmax
+        if len(class_probs.shape) > 1 and self.n_classes_ != 2:
+            # multi-class, turns softprob into softmax
             column_indexes: np.ndarray = np.argmax(class_probs, axis=1)  # type: ignore
+        elif len(class_probs.shape) > 1 and class_probs.shape[1] != 1:
+            # multi-label
+            column_indexes = np.zeros(class_probs.shape)
+            column_indexes[class_probs > 0.5] = 1
         else:
             # turns soft logit into class label
             column_indexes = np.repeat(0, class_probs.shape[0])
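Decoding differs accordingly in ``predict``: multi-class takes the argmax per row, while
multi-label thresholds each column independently at 0.5. A small sketch of the two
decodings, using made-up probabilities:

.. code-block:: python

  import numpy as np

  softprob = np.array([[0.2, 0.7, 0.1], [0.6, 0.3, 0.1]])   # rows sum to 1
  marginals = np.array([[0.9, 0.4, 0.8], [0.1, 0.6, 0.2]])  # independent columns

  # Multi-class: exactly one label per sample.
  print(np.argmax(softprob, axis=1))  # [1 0]

  # Multi-label: any column above 0.5 is positive.
  labels = np.zeros(marginals.shape)
  labels[marginals > 0.5] = 1
  print(labels)  # [[1. 0. 1.] [0. 1. 0.]]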
tests/python/test_with_sklearn.py
@@ -1194,6 +1194,24 @@ def test_estimator_type():
     cls.load_model(path)  # no error
 
 
+def test_multilabel_classification() -> None:
+    from sklearn.datasets import make_multilabel_classification
+
+    X, y = make_multilabel_classification(
+        n_samples=32, n_classes=5, n_labels=3, random_state=0
+    )
+    clf = xgb.XGBClassifier(tree_method="hist")
+    clf.fit(X, y)
+    booster = clf.get_booster()
+    learner = json.loads(booster.save_config())["learner"]
+    assert int(learner["learner_model_param"]["num_target"]) == 5
+
+    np.testing.assert_allclose(clf.predict(X), y)
+    predt = (clf.predict_proba(X) > 0.5).astype(np.int64)
+    np.testing.assert_allclose(clf.predict(X), predt)
+    assert predt.dtype == np.int64
+
+
 def run_data_initialization(DMatrix, model, X, y):
     """Assert that we don't create duplicated DMatrix."""
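One behavioral detail the test pins down: for multi-label input, ``predict_proba``
returns a single ``(n_samples, n_classes)`` array of positive-class probabilities, unlike
sklearn meta estimators, which return one two-column array per label. A minimal sketch of
that check outside the test harness, assuming xgboost >= 1.6:

.. code-block:: python

  import numpy as np
  import xgboost as xgb
  from sklearn.datasets import make_multilabel_classification

  X, y = make_multilabel_classification(
      n_samples=32, n_classes=5, n_labels=3, random_state=0
  )
  clf = xgb.XGBClassifier(tree_method="hist").fit(X, y)

  proba = clf.predict_proba(X)
  assert proba.shape == (32, 5)  # one positive-class probability per label
  # Thresholding at 0.5 reproduces predict(), as asserted in the test above.
  np.testing.assert_allclose(clf.predict(X), (proba > 0.5).astype(np.int64))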