Encode pandas categorical data automatically. (#7231)

This commit is contained in:
Jiaming Yuan 2021-09-17 11:09:55 +08:00 committed by GitHub
parent 32e0858501
commit 22d56cebf1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 40 additions and 16 deletions

View File

@ -220,50 +220,61 @@ _pandas_dtype_mapper = {
def _transform_pandas_df(
data,
enable_categorical,
enable_categorical: bool,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
meta=None,
meta_type=None,
):
from pandas import MultiIndex, Int64Index, RangeIndex
import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype
data_dtypes = data.dtypes
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
(is_categorical_dtype(dtype) and enable_categorical)
for dtype in data_dtypes):
for dtype in data.dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
if dtype.name not in _pandas_dtype_mapper
]
msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
categorical type is supplied, DMatrix parameter
`enable_categorical` must be set to `True`."""
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))
# handle feature names
if feature_names is None and meta is None:
if isinstance(data.columns, MultiIndex):
if isinstance(data.columns, pd.MultiIndex):
feature_names = [
' '.join([str(x) for x in i]) for i in data.columns
]
elif isinstance(data.columns, (Int64Index, RangeIndex)):
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
feature_names = list(map(str, data.columns))
else:
feature_names = data.columns.format()
# handle feature types
if feature_types is None and meta is None:
feature_types = []
for dtype in data_dtypes:
for i, dtype in enumerate(data.dtypes):
if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[
dtype.subtype.name])
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
elif is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T)
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
# handle categorical codes.
transformed = pd.DataFrame()
if enable_categorical:
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
transformed[data.columns[i]] = data[data.columns[i]].cat.codes
else:
transformed[data.columns[i]] = data[data.columns[i]]
else:
transformed = data
if meta and len(data.columns) > 1:
raise ValueError(
'DataFrame for {meta} cannot have multiple columns'.format(
@ -271,10 +282,10 @@ def _transform_pandas_df(
)
dtype = meta_type if meta_type else np.float32
data = data.values
arr = transformed.values
if meta_type:
data = data.astype(meta_type)
return data, feature_names, feature_types
arr = arr.astype(meta_type)
return arr, feature_names, feature_types
def _from_pandas_df(

View File

@ -582,6 +582,8 @@ struct GPUHistMakerDevice {
auto is_cat = candidate.split.is_cat;
if (is_cat) {
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
<< "Categorical feature value too large.";
auto cat = common::AsCat(candidate.split.fvalue);
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
LBitField32 cats_bits(split_cats);

View File

@ -130,6 +130,17 @@ class TestPandas:
m = xgb.DMatrix(X, y, enable_categorical=True)
assert m.feature_types[0] == 'c'
X_0 = ["f", "o", "o"]
X_1 = [4, 3, 2]
X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
X["feat_0"] = X["feat_0"].astype("category")
transformed, _, feature_types = xgb.data._transform_pandas_df(
X, enable_categorical=True
)
assert np.issubdtype(transformed[:, 0].dtype, np.integer)
assert transformed[:, 0].min() == 0
def test_pandas_sparse(self):
import pandas as pd
rows = 100