Enable categorical data support on Python DMatrix. (#6166)
* Only pandas is recognized.
parent 52c0b3f100 · commit 7622b8cdb8
@@ -384,7 +384,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
                  silent=False,
                  feature_names=None,
                  feature_types=None,
-                 nthread=None):
+                 nthread=None,
+                 enable_categorical=False):
         """Parameters
         ----------
         data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
@@ -419,6 +420,16 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             Number of threads to use for loading data when parallelization is
             applicable. If -1, uses maximum threads available on the system.
 
+        enable_categorical: boolean, optional
+
+            .. versionadded:: 1.3.0
+
+            Experimental support of specializing for categorical features.  Do
+            not set to True unless you are interested in development.
+            Currently it's only available for `gpu_hist` tree method with 1 vs
+            rest (one hot) categorical split.  Also, JSON serialization format,
+            `gpu_predictor` and pandas input are required.
+
         """
         if isinstance(data, list):
             raise TypeError('Input data can not be a list.')
@@ -437,7 +448,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             data, missing=self.missing,
             threads=self.nthread,
             feature_names=feature_names,
-            feature_types=feature_types)
+            feature_types=feature_types,
+            enable_categorical=enable_categorical)
         assert handle is not None
         self.handle = handle
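For orientation, a minimal usage sketch of the parameter introduced above. This is not part of the commit; the data and column names are made up, and per the docstring it assumes a CUDA-enabled build since only `gpu_hist` with `gpu_predictor` handles the categorical feature type at this point.

import numpy as np
import pandas as pd
import xgboost as xgb

# Hypothetical data: one pandas categorical column, one numeric column.
df = pd.DataFrame({
    'color': pd.Series(['red', 'green', 'blue', 'green'], dtype='category'),
    'weight': [1.0, 2.5, 0.3, 4.2],
})
y = np.array([0.0, 1.0, 0.0, 1.0])

# Without enable_categorical=True the categorical column is rejected.
dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

# Requires a GPU build: only gpu_hist + gpu_predictor support the
# resulting 'categorical' feature type in this experimental stage.
booster = xgb.train({'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'},
                    dtrain, num_boost_round=10)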
@@ -184,20 +184,24 @@ _pandas_dtype_mapper = {
 }
 
 
-def _transform_pandas_df(data, feature_names=None, feature_types=None,
+def _transform_pandas_df(data, enable_categorical,
+                         feature_names=None, feature_types=None,
                          meta=None, meta_type=None):
     from pandas import MultiIndex, Int64Index
-    from pandas.api.types import is_sparse
+    from pandas.api.types import is_sparse, is_categorical
 
     data_dtypes = data.dtypes
-    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
+    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
+               (is_categorical(dtype) and enable_categorical)
                for dtype in data_dtypes):
         bad_fields = [
             str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
             if dtype.name not in _pandas_dtype_mapper
         ]
 
-        msg = """DataFrame.dtypes for data must be int, float or bool.
-                Did not expect the data types in fields """
+        msg = """DataFrame.dtypes for data must be int, float, bool or categorical.  When
+                categorical type is supplied, DMatrix parameter
+                `enable_categorical` must be set to `True`."""
         raise ValueError(msg + ', '.join(bad_fields))
 
     if feature_names is None and meta is None:
@@ -216,6 +220,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
             if is_sparse(dtype):
                 feature_types.append(_pandas_dtype_mapper[
                     dtype.subtype.name])
+            elif is_categorical(dtype) and enable_categorical:
+                feature_types.append('categorical')
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])
 
@@ -226,13 +232,13 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
 
     dtype = meta_type if meta_type else np.float32
     data = np.ascontiguousarray(data.values, dtype=dtype)
 
     return data, feature_names, feature_types
 
 
-def _from_pandas_df(data, missing, nthread, feature_names, feature_types):
+def _from_pandas_df(data, enable_categorical, missing, nthread,
+                    feature_names, feature_types):
     data, feature_names, feature_types = _transform_pandas_df(
-        data, feature_names, feature_types)
+        data, enable_categorical, feature_names, feature_types)
     return _from_numpy_array(data, missing, nthread, feature_names,
                              feature_types)
 
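A small standalone sketch of the dtype branching that the hunks above add. It is for illustration only: the helper name and abridged dtype mapping are made up, and it uses the public `is_categorical_dtype` where the diff imports the older `is_categorical` helper.

import pandas as pd
from pandas.api.types import is_categorical_dtype

# Abridged, illustrative stand-in for xgboost's _pandas_dtype_mapper.
_PANDAS_TO_XGB = {'int64': 'int', 'float64': 'float'}

def sketch_feature_types(df: pd.DataFrame, enable_categorical: bool):
    """Known numeric dtypes map through the table, categorical columns
    become 'categorical' only when the caller opted in, anything else
    is rejected with the same kind of ValueError as above."""
    feature_types = []
    for name, dtype in df.dtypes.items():
        if dtype.name in _PANDAS_TO_XGB:
            feature_types.append(_PANDAS_TO_XGB[dtype.name])
        elif is_categorical_dtype(dtype) and enable_categorical:
            feature_types.append('categorical')
        else:
            raise ValueError(f'unsupported dtype in column {name!r}: {dtype}')
    return feature_types

df = pd.DataFrame({'f0': pd.Series([3, 4, 5], dtype='category'),
                   'f1': [1.0, 2.0, 3.0]})
print(sketch_feature_types(df, enable_categorical=True))   # ['categorical', 'float']
try:
    sketch_feature_types(df, enable_categorical=False)
except ValueError as err:
    print('rejected as expected:', err)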
@@ -244,6 +250,7 @@ def _is_pandas_series(data):
         return False
     return isinstance(data, pd.Series)
 
+
 def _is_modin_series(data):
     try:
         import modin.pandas as pd
@@ -507,7 +514,8 @@ def _has_array_protocol(data):
 
 
 def dispatch_data_backend(data, missing, threads,
-                          feature_names, feature_types):
+                          feature_names, feature_types,
+                          enable_categorical=False):
     '''Dispatch data for DMatrix.'''
     if _is_scipy_csr(data):
         return _from_scipy_csr(data, missing, feature_names, feature_types)
@@ -525,7 +533,7 @@ def dispatch_data_backend(data, missing, threads,
     if _is_tuple(data):
         return _from_tuple(data, missing, feature_names, feature_types)
     if _is_pandas_df(data):
-        return _from_pandas_df(data, missing, threads,
+        return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_pandas_series(data):
         return _from_pandas_series(data, missing, threads, feature_names,
@@ -551,7 +559,7 @@ def dispatch_data_backend(data, missing, threads,
         return _from_dt_df(data, missing, threads, feature_names,
                            feature_types)
     if _is_modin_df(data):
-        return _from_pandas_df(data, missing, threads,
+        return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_modin_series(data):
         return _from_pandas_series(data, missing, threads, feature_names,
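The keyword default `enable_categorical=False` keeps pre-existing callers of `dispatch_data_backend` working; only the pandas and modin DataFrame branches forward the flag, every other input type ignores it. A rough sketch of the same pattern in isolation, with made-up names rather than the real xgboost internals:

import numpy as np
import pandas as pd

def _from_dataframe(data, enable_categorical, missing, threads,
                    feature_names, feature_types):
    # Only the DataFrame path needs to know about categorical columns.
    kinds = ['categorical' if str(d) == 'category' and enable_categorical
             else 'numeric' for d in data.dtypes]
    return data.shape, kinds

def dispatch(data, missing, threads, feature_names, feature_types,
             enable_categorical=False):
    # New keyword with a default: older call sites stay valid, and the
    # DMatrix constructor passes the user's choice straight through.
    if isinstance(data, pd.DataFrame):
        return _from_dataframe(data, enable_categorical, missing, threads,
                               feature_names, feature_types)
    if isinstance(data, np.ndarray):
        return data.shape, ['numeric'] * data.shape[1]
    raise TypeError(f'unhandled input type: {type(data)}')

df = pd.DataFrame({'f0': pd.Series([1, 2, 3], dtype='category')})
print(dispatch(df, np.nan, 1, None, None, enable_categorical=True))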
@@ -655,7 +663,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_df(data):
-        data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
+        data, _, _ = _transform_pandas_df(data, False, meta=name,
+                                          meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_series(data):
@@ -680,7 +689,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
         _meta_from_dt(data, name, dtype, handle)
         return
     if _is_modin_df(data):
-        data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
+        data, _, _ = _transform_pandas_df(
+            data, False, meta=name, meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_modin_series(data):
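Note that the meta dispatch above hard-codes `False`: labels, weights and other meta fields stay numeric even when the feature matrix contains categorical columns. A hedged usage sketch (data and column names are made up):

import numpy as np
import pandas as pd
import xgboost as xgb

X = pd.DataFrame({'f0': pd.Series([3, 4, 5, 6], dtype='category')})
y = pd.Series([0.1, 0.7, 0.3, 0.9])          # numeric label is fine
m = xgb.DMatrix(X, enable_categorical=True)
m.set_label(y)                               # routed through dispatch_meta_backend

bad = pd.DataFrame({'y': pd.Series(['a', 'b', 'c', 'd'], dtype='category')})
# m.set_label(bad)  # DataFrame meta path passes False, so this raises ValueError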
@@ -67,7 +67,8 @@ class TestModin(unittest.TestCase):
         # 0  1  1  0  0
         # 1  2  0  1  0
         # 2  3  0  0  1
-        result, _, _ = xgb.data._transform_pandas_df(dummies)
+        result, _, _ = xgb.data._transform_pandas_df(dummies,
+                                                     enable_categorical=False)
         exp = np.array([[1., 1., 0., 0.],
                         [2., 0., 1., 0.],
                         [3., 0., 0., 1.]])
@@ -113,15 +114,15 @@ class TestModin(unittest.TestCase):
         # label must be a single column
         df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
         self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')
 
         # label must be supported dtype
         df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
         self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')
 
         df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
-        result, _, _ = xgb.data._transform_pandas_df(df, None, None,
+        result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
                                                      'label', 'float')
         np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
                                                        dtype=float))
@@ -67,7 +67,8 @@ class TestPandas(unittest.TestCase):
         # 0  1  1  0  0
         # 1  2  0  1  0
         # 2  3  0  0  1
-        result, _, _ = xgb.data._transform_pandas_df(dummies)
+        result, _, _ = xgb.data._transform_pandas_df(dummies,
+                                                     enable_categorical=False)
         exp = np.array([[1., 1., 0., 0.],
                         [2., 0., 1., 0.],
                         [3., 0., 0., 1.]])
@@ -109,6 +110,16 @@ class TestPandas(unittest.TestCase):
         assert dm.num_row() == 2
         assert dm.num_col() == 6
 
+    def test_pandas_categorical(self):
+        rng = np.random.RandomState(1994)
+        rows = 100
+        X = rng.randint(3, 7, size=rows)
+        X = pd.Series(X, dtype="category")
+        X = pd.DataFrame({'f0': X})
+        y = rng.randn(rows)
+        m = xgb.DMatrix(X, y, enable_categorical=True)
+        assert m.feature_types[0] == 'categorical'
+
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100
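The new test builds the categorical column directly through the Series constructor; in user code the more common route is `astype('category')` on an existing column. A small sketch along the same lines as the test above, using the same seed and column name:

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.RandomState(1994)
df = pd.DataFrame({'f0': rng.randint(3, 7, size=100)})
df['f0'] = df['f0'].astype('category')   # equivalent preparation in user code
y = rng.randn(100)

m = xgb.DMatrix(df, y, enable_categorical=True)
assert m.feature_types == ['categorical']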
@@ -129,15 +140,15 @@ class TestPandas(unittest.TestCase):
         # label must be a single column
         df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
         self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')
 
         # label must be supported dtype
         df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
         self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
-                          None, None, 'label', 'float')
+                          False, None, None, 'label', 'float')
 
         df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
-        result, _, _ = xgb.data._transform_pandas_df(df, None, None,
+        result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
                                                      'label', 'float')
         np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
                                                        dtype=float))