[doc] Mention data consistency for categorical features. (#9678)
This commit is contained in:
@@ -11,10 +11,13 @@ https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
|
||||
And the data can be found at:
|
||||
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
|
||||
|
||||
Also, see the tutorial for using XGBoost with categorical data:
|
||||
:doc:`/tutorials/categorical`.
|
||||
.. versionadded:: 1.6.0
|
||||
|
||||
.. versionadded:: 1.6.0
|
||||
See Also
|
||||
--------
|
||||
- :doc:`Tutorial </tutorials/categorical>`
|
||||
- :ref:`sphx_glr_python_examples_categorical.py`
|
||||
- :ref:`sphx_glr_python_examples_cat_pipeline.py`
|
||||
|
||||
"""
|
||||
|
||||
|
||||
145
demo/guide-python/cat_pipeline.py
Normal file
145
demo/guide-python/cat_pipeline.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
Feature engineering pipeline for categorical data
|
||||
=================================================
|
||||
|
||||
The script showcases how to keep the categorical data encoding consistent across
|
||||
training and inference. There are many ways to attain the same goal; this script can be
|
||||
used as a starting point.
|
||||
|
||||
See Also
|
||||
--------
|
||||
- :doc:`Tutorial </tutorials/categorical>`
|
||||
- :ref:`sphx_glr_python_examples_categorical.py`
|
||||
- :ref:`sphx_glr_python_examples_cat_in_the_dat.py`
|
||||
|
||||
"""
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.compose import make_column_selector, make_column_transformer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import OrdinalEncoder
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
|
||||
def make_example_data() -> Tuple[pd.DataFrame, pd.Series, List[str]]:
    """Generate a synthetic dataset for the demo.

    Returns
    -------
    A tuple ``(X, y, categorical_features)`` where ``X`` holds three integer
    "ID" columns (to be treated as categorical) plus numeric and boolean
    columns, ``y`` is a random regression target, and
    ``categorical_features`` lists the categorical column names.
    """
    n_samples = 2048
    rng = np.random.default_rng(1994)

    # We have three categorical features, while the rest are numerical.
    categorical_features = ["brand_id", "retailer_id", "category_id"]

    # Fix: draw from the seeded generator ``rng`` instead of the global
    # ``np.random`` state so the whole demo is reproducible end to end.
    df = pd.DataFrame(
        rng.integers(32, 96, size=(n_samples, 3)),
        columns=categorical_features,
    )

    df["price"] = rng.integers(100, 200, size=(n_samples,))
    df["stock_status"] = rng.choice([True, False], n_samples)
    df["on_sale"] = rng.choice([True, False], n_samples)
    df["label"] = rng.normal(loc=0.0, scale=1.0, size=n_samples)

    X = df.drop(["label"], axis=1)
    y = df["label"]

    return X, y, categorical_features
|
||||
|
||||
|
||||
def native() -> None:
    """Demonstrate consistent encoding with the native XGBoost interface.

    An ``OrdinalEncoder`` is fitted on the training split only, then the
    same fitted encoder is reused for both training and inference data so
    that category codes stay consistent.
    """
    X, y, cat_feats = make_example_data()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1994, test_size=0.2
    )

    # Fit the encoder on the training split only; categories unseen at
    # inference time are mapped to NaN (treated as missing) instead of
    # raising an error.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
    encoder.set_output(transform="pandas")
    encoder = encoder.fit(X_train[cat_feats])

    def encode(frame: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so the caller's frame is untouched -- this lets
        # us demonstrate encoding the same data more than once below.
        frame = frame.copy()
        codes = encoder.transform(frame[cat_feats])
        for name, categories in zip(cat_feats, encoder.categories_):
            # Rebuild a pandas categorical whose category list comes from
            # the fitted encoder, keeping codes stable across calls.
            codes[name] = pd.Categorical.from_codes(
                codes=codes[name].astype(np.int32), categories=categories
            )
        frame[cat_feats] = codes
        return frame

    # Encode both splits with the one fitted encoder.
    X_train_enc = encode(X_train)
    X_test_enc = encode(X_test)

    # Train with the native interface. The test QuantileDMatrix must
    # reference the training matrix via ``ref``.
    Xy_train = xgb.QuantileDMatrix(X_train_enc, y_train, enable_categorical=True)
    Xy_test = xgb.QuantileDMatrix(
        X_test_enc, y_test, enable_categorical=True, ref=Xy_train
    )
    booster = xgb.train({}, Xy_train)
    booster.predict(Xy_test)

    # Following shows that data are encoded consistently.

    # We first obtain result from newly encoded data,
    predt0 = booster.inplace_predict(encode(X_train.head(16)))
    # then we obtain result from already encoded data from training.
    predt1 = booster.inplace_predict(X_train_enc.head(16))

    np.testing.assert_allclose(predt0, predt1)
|
||||
|
||||
|
||||
def pipeline() -> None:
    """Demonstrate consistent encoding with a scikit-learn pipeline.

    The encoder is a pipeline step, so fitting the pipeline fits the
    encoder on the training data and every subsequent ``predict`` reuses
    the same fitted encoding.
    """
    X, y, cat_feats = make_example_data()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=3, test_size=0.2
    )

    # Ordinal-encode the ID columns; everything else passes through.
    preprocessor = make_column_transformer(
        (
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
            # all categorical feature names end with "_id"
            make_column_selector(pattern=".*_id"),
        ),
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    # No need to set pandas output, we use `feature_types` to indicate the type of
    # features.

    # preprocessor.set_output(transform="pandas")

    # Mark each column as categorical ("c") or numerical ("q") for XGBoost.
    feature_types = ["c" if col in cat_feats else "q" for col in X_train.columns]
    regressor = xgb.XGBRegressor(
        feature_types=feature_types, enable_categorical=True, n_estimators=10
    )
    model = make_pipeline(preprocessor, regressor)
    model.fit(X_train, y_train)

    # check XGBoost is using the feature type correctly.
    booster_types = regressor.get_booster().feature_types
    assert booster_types is not None
    assert all(m == f for m, f in zip(booster_types, feature_types))

    # Following shows that data are encoded consistently.

    # We first create a slice of data that doesn't contain all the categories
    predt0 = model.predict(X_train.iloc[:16, :])
    # Then we use the dataframe that contains all the categories
    predt1 = model.predict(X_train)[:16]

    # The resulting encoding is the same
    np.testing.assert_allclose(predt0, predt1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run both variants of the demo: the sklearn pipeline first, then the
    # native XGBoost interface.
    for demo in (pipeline, native):
        demo()
|
||||
@@ -8,10 +8,13 @@ In before, users need to run an encoder themselves before passing the data into
|
||||
which creates a sparse matrix and potentially increase memory usage. This demo
|
||||
showcases the experimental categorical data support, more advanced features are planned.
|
||||
|
||||
Also, see :doc:`the tutorial </tutorials/categorical>` for using XGBoost with
|
||||
categorical data.
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
See Also
|
||||
--------
|
||||
- :doc:`Tutorial </tutorials/categorical>`
|
||||
- :ref:`sphx_glr_python_examples_cat_in_the_dat.py`
|
||||
- :ref:`sphx_glr_python_examples_cat_pipeline.py`
|
||||
|
||||
"""
|
||||
from typing import Tuple
|
||||
@@ -52,11 +55,13 @@ def make_categorical(
|
||||
|
||||
def main() -> None:
|
||||
# Use builtin categorical data support
|
||||
# For scikit-learn interface, the input data must be pandas DataFrame or cudf
|
||||
# DataFrame with categorical features
|
||||
|
||||
# For scikit-learn interface, the input data should be pandas DataFrame or cudf
|
||||
# DataFrame with categorical features. If an numpy/cupy array is used instead, the
|
||||
# `feature_types` for `XGBRegressor` should be set accordingly.
|
||||
X, y = make_categorical(100, 10, 4, False)
|
||||
# Specify `enable_categorical` to True, also we use onehot encoding based split
|
||||
# here for demonstration. For details see the document of `max_cat_to_onehot`.
|
||||
# Specify `enable_categorical` to True, also we use onehot-encoding-based split here
|
||||
# for demonstration. For details see the document of `max_cat_to_onehot`.
|
||||
reg = xgb.XGBRegressor(
|
||||
tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user