Encode pandas categorical data automatically. (#7231)
This commit is contained in:
parent
32e0858501
commit
22d56cebf1
@ -220,50 +220,61 @@ _pandas_dtype_mapper = {
|
||||
|
||||
def _transform_pandas_df(
|
||||
data,
|
||||
enable_categorical,
|
||||
enable_categorical: bool,
|
||||
feature_names: Optional[List[str]] = None,
|
||||
feature_types: Optional[List[str]] = None,
|
||||
meta=None,
|
||||
meta_type=None,
|
||||
):
|
||||
from pandas import MultiIndex, Int64Index, RangeIndex
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_sparse, is_categorical_dtype
|
||||
|
||||
data_dtypes = data.dtypes
|
||||
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
|
||||
(is_categorical_dtype(dtype) and enable_categorical)
|
||||
for dtype in data_dtypes):
|
||||
for dtype in data.dtypes):
|
||||
bad_fields = [
|
||||
str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
|
||||
str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
|
||||
if dtype.name not in _pandas_dtype_mapper
|
||||
]
|
||||
|
||||
msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
|
||||
categorical type is supplied, DMatrix parameter
|
||||
`enable_categorical` must be set to `True`."""
|
||||
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
|
||||
categorical type is supplied, DMatrix parameter `enable_categorical` must
|
||||
be set to `True`."""
|
||||
raise ValueError(msg + ', '.join(bad_fields))
|
||||
|
||||
# handle feature names
|
||||
if feature_names is None and meta is None:
|
||||
if isinstance(data.columns, MultiIndex):
|
||||
if isinstance(data.columns, pd.MultiIndex):
|
||||
feature_names = [
|
||||
' '.join([str(x) for x in i]) for i in data.columns
|
||||
]
|
||||
elif isinstance(data.columns, (Int64Index, RangeIndex)):
|
||||
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
|
||||
feature_names = list(map(str, data.columns))
|
||||
else:
|
||||
feature_names = data.columns.format()
|
||||
|
||||
# handle feature types
|
||||
if feature_types is None and meta is None:
|
||||
feature_types = []
|
||||
for dtype in data_dtypes:
|
||||
for i, dtype in enumerate(data.dtypes):
|
||||
if is_sparse(dtype):
|
||||
feature_types.append(_pandas_dtype_mapper[
|
||||
dtype.subtype.name])
|
||||
feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
|
||||
elif is_categorical_dtype(dtype) and enable_categorical:
|
||||
feature_types.append(CAT_T)
|
||||
else:
|
||||
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
||||
|
||||
# handle categorical codes.
|
||||
transformed = pd.DataFrame()
|
||||
if enable_categorical:
|
||||
for i, dtype in enumerate(data.dtypes):
|
||||
if is_categorical_dtype(dtype):
|
||||
transformed[data.columns[i]] = data[data.columns[i]].cat.codes
|
||||
else:
|
||||
transformed[data.columns[i]] = data[data.columns[i]]
|
||||
else:
|
||||
transformed = data
|
||||
|
||||
if meta and len(data.columns) > 1:
|
||||
raise ValueError(
|
||||
'DataFrame for {meta} cannot have multiple columns'.format(
|
||||
@ -271,10 +282,10 @@ def _transform_pandas_df(
|
||||
)
|
||||
|
||||
dtype = meta_type if meta_type else np.float32
|
||||
data = data.values
|
||||
arr = transformed.values
|
||||
if meta_type:
|
||||
data = data.astype(meta_type)
|
||||
return data, feature_names, feature_types
|
||||
arr = arr.astype(meta_type)
|
||||
return arr, feature_names, feature_types
|
||||
|
||||
|
||||
def _from_pandas_df(
|
||||
|
||||
@ -582,6 +582,8 @@ struct GPUHistMakerDevice {
|
||||
|
||||
auto is_cat = candidate.split.is_cat;
|
||||
if (is_cat) {
|
||||
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
|
||||
<< "Categorical feature value too large.";
|
||||
auto cat = common::AsCat(candidate.split.fvalue);
|
||||
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
|
||||
LBitField32 cats_bits(split_cats);
|
||||
|
||||
@ -130,6 +130,17 @@ class TestPandas:
|
||||
m = xgb.DMatrix(X, y, enable_categorical=True)
|
||||
assert m.feature_types[0] == 'c'
|
||||
|
||||
X_0 = ["f", "o", "o"]
|
||||
X_1 = [4, 3, 2]
|
||||
X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
|
||||
X["feat_0"] = X["feat_0"].astype("category")
|
||||
transformed, _, feature_types = xgb.data._transform_pandas_df(
|
||||
X, enable_categorical=True
|
||||
)
|
||||
|
||||
assert np.issubdtype(transformed[:, 0].dtype, np.integer)
|
||||
assert transformed[:, 0].min() == 0
|
||||
|
||||
def test_pandas_sparse(self):
|
||||
import pandas as pd
|
||||
rows = 100
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user