Update document for multi output and categorical. (#7574)
* Group together categorical related parameters.
* Update documents about multioutput and categorical.
This commit is contained in:
parent dac9eb13bd
commit b4ec1682c6
@@ -7,7 +7,7 @@ weight is not used in following example. In this script, we implement the Square
 Error (SLE) objective and RMSLE metric as customized functions, then compare it with
 native implementation in XGBoost.
 
-See doc/tutorials/custom_metric_obj.rst for a step by step walkthrough, with other
+See :doc:`/tutorials/custom_metric_obj` for a step by step walkthrough, with other
 details.
 
 The `SLE` objective reduces impact of outliers in training dataset, hence here we also
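A rough sketch of such a customized Squared Log Error objective, for orientation only; the demo script's exact implementation is not shown in this hunk, and the training call in the comment is an assumption about how it would be wired up:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    def squared_log(predt: np.ndarray, dtrain: xgb.DMatrix):
        """Gradient and hessian of 1/2 * (log1p(predt) - log1p(y))**2."""
        y = dtrain.get_label()
        predt[predt < -1] = -1 + 1e-6  # keep log1p(predt) well defined
        grad = (np.log1p(predt) - np.log1p(y)) / (predt + 1)
        hess = (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2)
        return grad, hess

    # booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=10, obj=squared_log)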
@@ -5,6 +5,8 @@ A demo for multi-output regression
 The demo is adopted from scikit-learn:
 
 https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
+
+See :doc:`/tutorials/multioutput` for more information.
 """
 import numpy as np
 import xgboost as xgb
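For the behaviour this demo relies on (one model fitted per target column), a minimal hedged sketch along the following lines should work; the synthetic data and parameter choices are illustrative and not taken from the demo script:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    X = rng.randn(100, 10)
    # Two regression targets per sample -> y has shape (n_samples, n_targets).
    y = np.stack([2.0 * X[:, 0], X[:, 1] - X[:, 2]], axis=1)

    # XGBoost builds one model per target internally while reusing the input data.
    reg = xgb.XGBRegressor(tree_method="hist", n_estimators=32)
    reg.fit(X, y)
    pred = reg.predict(X)  # shape (100, 2), one column per target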
@@ -113,7 +113,7 @@ Miscellaneous
 *************
 
 By default, XGBoost assumes input categories are integers starting from 0 till the number
-of categories :math:`[0, n_categories)`. However, user might provide inputs with invalid
+of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
 values due to mistakes or missing values. It can be negative value, integer values that
 can not be accurately represented by 32-bit floating point, or values that are larger than
 actual number of unique categories. During training this is validated but for prediction
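To make the expected encoding concrete, pandas category codes already satisfy this contract; a small illustration, assuming pandas is available:

.. code-block:: python

    import pandas as pd

    # Pandas encodes categories as consecutive integers starting at 0, which is
    # the [0, n_categories) encoding described above.
    s = pd.Series(["a", "c", "a", "b"], dtype="category")
    print(s.cat.codes.tolist())   # [0, 2, 0, 1]
    print(len(s.cat.categories))  # 3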
@@ -12,14 +12,15 @@ terminologies related to different multi-output models please refer to the `scik
 user guide <https://scikit-learn.org/stable/modules/multiclass.html>`_.
 
 Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
-with the added benefit of reusing data and custom objective support. For a worked example
-of regression, see :ref:`sphx_glr_python_examples_multioutput_regression.py`. For
-multi-label classification, the binary relevance strategy is used. Input ``y`` should be
-of shape ``(n_samples, n_classes)`` with each column having a value of 0 or 1 to specify
-whether the sample is labeled as positive for respective class. Given a sample with 3
-output classes and 2 labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with
-the second class labeled as negative and the rest labeled as positive. At the moment
-XGBoost supports only dense matrix for labels.
+with the added benefit of reusing data and other integrated features like SHAP. For a
+worked example of regression, see
+:ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
+the binary relevance strategy is used. Input ``y`` should be of shape ``(n_samples,
+n_classes)`` with each column having a value of 0 or 1 to specify whether the sample is
+labeled as positive for respective class. Given a sample with 3 output classes and 2
+labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with the second class
+labeled as negative and the rest labeled as positive. At the moment XGBoost supports only
+dense matrix for labels.
 
 .. code-block:: python
 
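A minimal multi-label sketch in the spirit of the paragraph above (separate from the tutorial's own example, whose body falls outside this hunk; the dense 0/1 label matrix and model settings are illustrative assumptions):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.randn(200, 8)
    # Dense (n_samples, n_classes) label matrix; each column is 0 or 1.
    y = (rng.rand(200, 3) > 0.5).astype(np.int64)

    # Binary relevance: one binary classifier is fitted per label column.
    clf = xgb.XGBClassifier(tree_method="hist", n_estimators=16)
    clf.fit(X, y)
    labels = clf.predict(X)  # shape (200, 3), one 0/1 prediction per class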
@@ -197,6 +197,18 @@ __model_doc = f'''
 Experimental support for categorical data. Do not set to true unless you are
 interested in development. Only valid when `gpu_hist` and dataframe are used.
 
+max_cat_to_onehot : bool
+
+    .. versionadded:: 1.6.0
+
+    .. note:: This parameter is experimental
+
+    A threshold for deciding whether XGBoost should use one-hot encoding based split
+    for categorical data. When number of categories is lesser than the threshold then
+    one-hot encoding is chosen, otherwise the categories will be partitioned into
+    children nodes. Only relevant for regression and binary classification and
+    `approx` tree method.
+
 eval_metric : Optional[Union[str, List[str], Callable]]
 
     .. versionadded:: 1.6.0
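A hedged sketch of how the newly documented ``max_cat_to_onehot`` threshold might be used together with ``enable_categorical``; the data frame and parameter values below are illustrative assumptions, not taken from the commit:

.. code-block:: python

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.RandomState(7)
    # One categorical feature; the column dtype must be "category" for
    # enable_categorical to take effect.
    df = pd.DataFrame({
        "color": pd.Categorical(rng.choice(["red", "green", "blue"], size=100)),
        "x0": rng.randn(100),
    })
    y = rng.randn(100)

    # With only 3 categories and a threshold of 4, splits on "color" use
    # one-hot style splits; higher-cardinality features would be partitioned.
    reg = xgb.XGBRegressor(
        tree_method="approx",
        enable_categorical=True,
        max_cat_to_onehot=4,
        n_estimators=16,
    )
    reg.fit(df, y)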
@@ -267,16 +279,6 @@ __model_doc = f'''
 callbacks = [xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
                                         save_best=True)]
 
-max_cat_to_onehot : bool
-
-    .. versionadded:: 1.6.0
-
-    A threshold for deciding whether XGBoost should use one-hot encoding based split
-    for categorical data. When number of categories is lesser than the threshold then
-    one-hot encoding is chosen, otherwise the categories will be partitioned into
-    children nodes. Only relevant for regression and binary classification and
-    `approx` tree method.
-
 kwargs : dict, optional
     Keyword arguments for XGBoost Booster object. Full documentation of parameters
     can be found :doc:`here </parameter>`.
@@ -490,10 +492,10 @@ class XGBModel(XGBModelBase):
         validate_parameters: Optional[bool] = None,
         predictor: Optional[str] = None,
         enable_categorical: bool = False,
+        max_cat_to_onehot: Optional[int] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
-        max_cat_to_onehot: Optional[int] = None,
         **kwargs: Any
     ) -> None:
         if not SKLEARN_INSTALLED:
@@ -530,10 +532,10 @@ class XGBModel(XGBModelBase):
         self.validate_parameters = validate_parameters
         self.predictor = predictor
         self.enable_categorical = enable_categorical
+        self.max_cat_to_onehot = max_cat_to_onehot
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
-        self.max_cat_to_onehot = max_cat_to_onehot
         if kwargs:
             self.kwargs = kwargs
 