From b4ec1682c6754805e9cc2b4da290779f520003ac Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 19 Jan 2022 04:35:17 +0800
Subject: [PATCH] Update document for multi output and categorical. (#7574)

* Group together categorical related parameters.
* Update documents about multioutput and categorical.
---
 demo/guide-python/custom_rmsle.py           |  2 +-
 demo/guide-python/multioutput_regression.py |  2 ++
 doc/tutorials/categorical.rst               |  2 +-
 doc/tutorials/multioutput.rst               | 17 +++++++-------
 python-package/xgboost/sklearn.py           | 26 +++++++++++----------
 5 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/demo/guide-python/custom_rmsle.py b/demo/guide-python/custom_rmsle.py
index 66fbd83a0..bc21f9022 100644
--- a/demo/guide-python/custom_rmsle.py
+++ b/demo/guide-python/custom_rmsle.py
@@ -7,7 +7,7 @@ weight is not used in following example.
 In this script, we implement the Squared Log Error (SLE) objective and RMSLE metric as
 customized functions, then compare it with native implementation in XGBoost.
 
-See doc/tutorials/custom_metric_obj.rst for a step by step walkthrough, with other
+See :doc:`/tutorials/custom_metric_obj` for a step by step walkthrough, with other
 details.
 
 The `SLE` objective reduces impact of outliers in training dataset, hence here we also
diff --git a/demo/guide-python/multioutput_regression.py b/demo/guide-python/multioutput_regression.py
index a0d0998e6..f3f62609c 100644
--- a/demo/guide-python/multioutput_regression.py
+++ b/demo/guide-python/multioutput_regression.py
@@ -5,6 +5,8 @@ A demo for multi-output regression
 The demo is adopted from scikit-learn:
 
 https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
+
+See :doc:`/tutorials/multioutput` for more information.
 """
 import numpy as np
 import xgboost as xgb
diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
index f302e5e47..c1d93fb45 100644
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -113,7 +113,7 @@ Miscellaneous
 *************
 
 By default, XGBoost assumes input categories are integers starting from 0 till the number
-of categories :math:`[0, n_categories)`. However, user might provide inputs with invalid
+of categories :math:`[0, n\_categories)`. However, users might provide inputs with invalid
 values due to mistakes or missing values. It can be negative value, integer values that
 can not be accurately represented by 32-bit floating point, or values that are larger than
 actual number of unique categories. During training this is validated but for prediction
diff --git a/doc/tutorials/multioutput.rst b/doc/tutorials/multioutput.rst
index d9af9313e..0be27ced0 100644
--- a/doc/tutorials/multioutput.rst
+++ b/doc/tutorials/multioutput.rst
@@ -12,14 +12,15 @@ terminologies related to different multi-output models please refer to the `scikit-learn
 user guide <https://scikit-learn.org/stable/modules/multiclass.html>`_.
 
 Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
-with the added benefit of reusing data and custom objective support. For a worked example
-of regression, see :ref:`sphx_glr_python_examples_multioutput_regression.py`. For
-multi-label classification, the binary relevance strategy is used. Input ``y`` should be
-of shape ``(n_samples, n_classes)`` with each column having a value of 0 or 1 to specify
-whether the sample is labeled as positive for respective class. Given a sample with 3
-output classes and 2 labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with
-the second class labeled as negative and the rest labeled as positive. At the moment
-XGBoost supports only dense matrix for labels.
+with the added benefit of reusing data and other integrated features like SHAP. For a
+worked example of regression, see
+:ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
+the binary relevance strategy is used. Input ``y`` should be of shape ``(n_samples,
+n_classes)`` with each column having a value of 0 or 1 to specify whether the sample is
+labeled as positive for the respective class. Given a sample with 3 output classes and 2
+labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with the second class
+labeled as negative and the rest labeled as positive. At the moment XGBoost supports only
+dense matrices for labels.
 
 .. code-block:: python
 
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 54970af6d..374958f75 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -197,6 +197,18 @@ __model_doc = f'''
         Experimental support for categorical data.  Do not set to true unless you are
         interested in development. Only valid when `gpu_hist` and dataframe are used.
 
+    max_cat_to_onehot : Optional[int]
+
+        .. versionadded:: 1.6.0
+
+        .. note:: This parameter is experimental
+
+        A threshold for deciding whether XGBoost should use one-hot encoding based
+        splits for categorical data. When the number of categories is less than the
+        threshold, one-hot encoding is chosen; otherwise the categories will be
+        partitioned into child nodes. Only relevant for regression and binary
+        classification with the `approx` tree method.
+
     eval_metric : Optional[Union[str, List[str], Callable]]
 
         .. versionadded:: 1.6.0
@@ -267,16 +279,6 @@ __model_doc = f'''
             callbacks = [xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
                                                     save_best=True)]
 
-    max_cat_to_onehot : bool
-
-        .. versionadded:: 1.6.0
-
-        A threshold for deciding whether XGBoost should use one-hot encoding based split
-        for categorical data. When number of categories is lesser than the threshold then
-        one-hot encoding is chosen, otherwise the categories will be partitioned into
-        children nodes. Only relevant for regression and binary classification and
-        `approx` tree method.
-
     kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters
         can be found :doc:`here </parameter>`.
@@ -490,10 +492,10 @@ class XGBModel(XGBModelBase):
         validate_parameters: Optional[bool] = None,
         predictor: Optional[str] = None,
         enable_categorical: bool = False,
+        max_cat_to_onehot: Optional[int] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
-        max_cat_to_onehot: Optional[int] = None,
         **kwargs: Any
     ) -> None:
         if not SKLEARN_INSTALLED:
@@ -530,10 +532,10 @@ class XGBModel(XGBModelBase):
         self.validate_parameters = validate_parameters
         self.predictor = predictor
         self.enable_categorical = enable_categorical
+        self.max_cat_to_onehot = max_cat_to_onehot
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
-        self.max_cat_to_onehot = max_cat_to_onehot
         if kwargs:
             self.kwargs = kwargs
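
For reviewers, here is a minimal sketch of the multi-label workflow that the
``doc/tutorials/multioutput.rst`` hunk above documents. It is not part of the patch; it
assumes XGBoost >= 1.6 with scikit-learn installed, and mirrors the tutorial's use of
``make_multilabel_classification``:

.. code-block:: python

    import xgboost as xgb
    from sklearn.datasets import make_multilabel_classification

    # Three non-exclusive classes with roughly two labels per sample; ``y`` is a
    # dense 0/1 matrix of shape (n_samples, 3), so a row such as [1, 0, 1] marks
    # the sample positive for the first and third classes.
    X, y = make_multilabel_classification(
        n_samples=32, n_classes=3, n_labels=2, random_state=0
    )

    # Binary relevance: one model is built per target, reusing the same data.
    clf = xgb.XGBClassifier(tree_method="hist")
    clf.fit(X, y)

    # Predictions come back with one 0/1 column per class.
    assert clf.predict(X).shape == (32, 3)

The relocated ``max_cat_to_onehot`` parameter is passed through the same estimator
interface, e.g. ``xgb.XGBRegressor(tree_method="approx", enable_categorical=True,
max_cat_to_onehot=4)`` with a categorical-dtype dataframe as input; the value ``4`` here
is only illustrative.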