[doc] Small improvements for categorical data document. (#7330)

This commit is contained in:
Jiaming Yuan 2021-10-20 18:04:32 +08:00 committed by GitHub
parent f999897615
commit 15685996fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 16 deletions

View File

@ -7,6 +7,9 @@ https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
And the data can be found at: And the data can be found at:
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
Also, see the tutorial for using XGBoost with categorical data:
https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html
.. versionadded:: 1.6.0 .. versionadded:: 1.6.0
""" """
@ -48,8 +51,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
for i in range(0, 6): for i in range(0, 6):
X["ord_" + str(i)] = X["ord_" + str(i)].astype("category") X["ord_" + str(i)] = X["ord_" + str(i)].astype("category")
print(X.shape)
print( print(
"train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1]) "train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1])
) )
@ -64,7 +65,7 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
X_train, X_test, y_train, y_test = train_test_split( X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=1994, test_size=0.2 X, y, random_state=1994, test_size=0.2
) )
# Specify `enable_categorical`.
clf = xgb.XGBClassifier(**params, enable_categorical=True) clf = xgb.XGBClassifier(**params, enable_categorical=True)
clf.fit( clf.fit(
X_train, X_train,
@ -72,7 +73,6 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
eval_set=[(X_test, y_test), (X_train, y_train)], eval_set=[(X_test, y_test), (X_train, y_train)],
eval_metric="auc", eval_metric="auc",
) )
print(clf.n_classes_)
clf.save_model(os.path.join(output_dir, "categorical.json")) clf.save_model(os.path.join(output_dir, "categorical.json"))
y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples
@ -82,12 +82,10 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None: def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
"""Train using one-hot encoded data.""" """Train using one-hot encoded data."""
X_train, X_test, y_train, y_test = train_test_split( X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42, test_size=0.2 X, y, random_state=42, test_size=0.2
) )
print(X_train.shape, y_train.shape) # Specify `enable_categorical`.
clf = xgb.XGBClassifier(**params, enable_categorical=False) clf = xgb.XGBClassifier(**params, enable_categorical=False)
clf.fit( clf.fit(
X_train, X_train,

View File

@ -5,6 +5,9 @@ In before, users need to run an encoder themselves before passing the data into
which creates a sparse matrix and potentially increases memory usage. This demo showcases which creates a sparse matrix and potentially increases memory usage. This demo showcases
the experimental categorical data support, more advanced features are planned. the experimental categorical data support, more advanced features are planned.
Also, see the tutorial for using XGBoost with categorical data:
https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html
.. versionadded:: 1.5.0 .. versionadded:: 1.5.0
""" """

View File

@ -58,10 +58,12 @@ can plot the model and calculate the global feature importance:
The ``scikit-learn`` interface from dask is similar to the single node version. The basic The ``scikit-learn`` interface from dask is similar to the single node version. The basic
idea is to create a dataframe with category feature type, and tell XGBoost to use ``gpu_hist`` idea is to create a dataframe with category feature type, and tell XGBoost to use ``gpu_hist``
with parameter ``enable_categorical``. See `this demo with parameter ``enable_categorical``. See `this demo
<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/categorical.py>`_ for a <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/categorical.py>`__ for a
worked example using categorical data with ``scikit-learn`` interface. For using it with worked example of using categorical data with ``scikit-learn`` interface. A comparison
the Kaggle tutorial dataset, see `this demo between using one-hot encoded data and XGBoost's categorical data support can be found
<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cat_in_the_dat.py>`_ `here
<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cat_in_the_dat.py>`__.
********************** **********************
@ -70,10 +72,10 @@ Using native interface
The ``scikit-learn`` interface is user friendly, but lacks some features that are only The ``scikit-learn`` interface is user friendly, but lacks some features that are only
available in the native interface. For instance, users cannot compute SHAP values directly or available in the native interface. For instance, users cannot compute SHAP values directly or
use quantized ``DMatrix``. Also native interface supports data types other than use quantized :class:`DMatrix <xgboost.DMatrix>`. Also native interface supports data
dataframe, like ``numpy/cupy array``. To use the native interface with categorical data, types other than dataframe, like ``numpy/cupy array``. To use the native interface with
we need to pass the similar parameter to ``DMatrix`` and the ``train`` function. For categorical data, we need to pass the similar parameter to :class:`DMatrix
dataframe input: <xgboost.DMatrix>` and the :func:`train <xgboost.train>` function. For dataframe input:
.. code:: python .. code:: python
@ -106,7 +108,7 @@ types by using the ``feature_types`` parameter in :class:`DMatrix <xgboost.DMatr
For numerical data, the feature type can be ``"q"`` or ``"float"``, while for categorical For numerical data, the feature type can be ``"q"`` or ``"float"``, while for categorical
feature it's specified as ``"c"``. The Dask module in XGBoost has the same interface so feature it's specified as ``"c"``. The Dask module in XGBoost has the same interface so
``dask.Array`` can also be used as categorical data. :class:`dask.Array <dask.array.Array>` can also be used as categorical data.
********** **********