Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
This commit is contained in:
Jiaming Yuan
2021-10-28 03:33:54 +08:00
committed by GitHub
parent 2eee87423c
commit ac9bfaa4f2
13 changed files with 301 additions and 103 deletions

View File

@@ -373,7 +373,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
raise exc # pylint: disable=raising-bad-type
def __del__(self) -> None:
assert self._temporary_data is None, self._temporary_data
assert self._temporary_data is None
assert self._exception is None
def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
@@ -397,19 +397,19 @@ class DataIter: # pylint: disable=too-many-instance-attributes
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
**kwargs: Any,
):
) -> None:
from .data import dispatch_proxy_set_data
from .data import _proxy_transform
transformed, feature_names, feature_types = _proxy_transform(
new, cat_codes, feature_names, feature_types = _proxy_transform(
data,
feature_names,
feature_types,
self._enable_categorical,
)
# Stage the data; meta info is copied inside C++ MetaInfo.
self._temporary_data = transformed
dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
self._temporary_data = (new, cat_codes)
dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
self.proxy.set_info(
feature_names=feature_names,
feature_types=feature_types,
@@ -1090,7 +1090,7 @@ class _ProxyDMatrix(DMatrix):
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
def _set_data_from_cuda_interface(self, data):
def _set_data_from_cuda_interface(self, data) -> None:
"""Set data from CUDA array interface."""
interface = data.__cuda_array_interface__
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@@ -1098,11 +1098,11 @@ class _ProxyDMatrix(DMatrix):
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
)
def _set_data_from_cuda_columnar(self, data):
def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
"""Set data from CUDA columnar format."""
from .data import _cudf_array_interfaces
_, interfaces_str = _cudf_array_interfaces(data)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
def _set_data_from_array(self, data: np.ndarray):
@@ -2009,13 +2009,18 @@ class Booster(object):
from .data import _is_pandas_df, _transform_pandas_df
from .data import _array_interface
if _is_pandas_df(data):
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)
if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype
data, _ = _ensure_np_dtype(data, data.dtype)
@@ -2068,8 +2073,11 @@ class Booster(object):
)
return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
from .data import _cudf_array_interfaces
_, interfaces_str = _cudf_array_interfaces(data)
from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
data, None, None, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,

View File

@@ -1,4 +1,4 @@
# pylint: disable=too-many-arguments, too-many-branches
# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
# pylint: disable=too-many-return-statements, import-error
'''Data dispatching for DMatrix.'''
import ctypes
@@ -12,7 +12,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix
from .compat import lazy_isinstance
from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
}
def _invalid_dataframe_dtype(data) -> None:
    """Raise ``ValueError`` reporting that ``data`` carries unsupported dtypes.

    When ``data`` exposes an iterable ``dtypes`` attribute, the labels of the
    columns whose dtype name is not a key of ``_pandas_dtype_mapper`` are
    appended to the message.  Otherwise only the generic message is used.

    Raises
    ------
    ValueError
        Unconditionally; this helper only builds and raises the error.
    """
    # pandas series has `dtypes` but it's just a single object
    # cudf series doesn't have `dtypes`.
    per_column = hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__")

    suffix = ""
    if per_column:
        offending = []
        for position, column_dtype in enumerate(data.dtypes):
            if column_dtype.name in _pandas_dtype_mapper:
                continue
            offending.append(str(data.columns[position]))
        suffix = " Invalid columns:" + ", ".join(offending)

    raise ValueError(
        "DataFrame.dtypes for data must be int, float, bool or category. When\n"
        "categorical type is supplied, DMatrix parameter `enable_categorical` must\n"
        "be set to `True`." + suffix
    )
def _transform_pandas_df(
data,
data: DataFrame,
enable_categorical: bool,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
meta=None,
meta_type=None,
):
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
(is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
if dtype.name not in _pandas_dtype_mapper
]
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))
if not all(
dtype.name in _pandas_dtype_mapper
or is_sparse(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex):
feature_names = [
' '.join([str(x) for x in i]) for i in data.columns
]
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
feature_names = list(map(str, data.columns))
else:
@@ -263,21 +275,24 @@ def _transform_pandas_df(
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
# handle categorical codes.
# handle category codes.
transformed = pd.DataFrame()
if enable_categorical:
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
transformed[data.columns[i]] = data[data.columns[i]].cat.codes
# pandas uses -1 as default missing value for categorical data
transformed[data.columns[i]] = (
data[data.columns[i]]
.cat.codes.astype(np.float32)
.replace(-1.0, np.NaN)
)
else:
transformed[data.columns[i]] = data[data.columns[i]]
else:
transformed = data
if meta and len(data.columns) > 1:
raise ValueError(
f"DataFrame for {meta} cannot have multiple columns"
)
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
dtype = meta_type if meta_type else np.float32
arr = transformed.values
@@ -287,7 +302,7 @@ def _transform_pandas_df(
def _from_pandas_df(
data,
data: DataFrame,
enable_categorical: bool,
missing: float,
nthread: int,
@@ -299,6 +314,7 @@ def _from_pandas_df(
)
return _from_numpy_array(data, missing, nthread, feature_names, feature_types)
def _is_pandas_series(data):
try:
import pandas as pd
@@ -317,13 +333,26 @@ def _is_modin_series(data):
def _from_pandas_series(
data,
missing,
nthread,
missing: float,
nthread: int,
enable_categorical: bool,
feature_names: Optional[List[str]],
feature_types: Optional[List[str]],
):
from pandas.api.types import is_categorical_dtype
if (data.dtype.name not in _pandas_dtype_mapper) and not (
is_categorical_dtype(data.dtype) and enable_categorical
):
_invalid_dataframe_dtype(data)
if enable_categorical and is_categorical_dtype(data.dtype):
data = data.cat.codes
return _from_numpy_array(
data.values.astype("float"), missing, nthread, feature_names, feature_types
data.values.reshape(data.shape[0], 1).astype("float"),
missing,
nthread,
feature_names,
feature_types,
)
@@ -427,7 +456,7 @@ def _is_cudf_df(data):
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
data and a list of array interfaces. The data is list of categorical codes that
caller can safely ignore, but have to keep their reference alive until usage of array
@@ -439,23 +468,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
except ImportError:
from cudf.utils.dtypes import is_categorical_dtype
cat_codes = []
interfaces = []
if _is_cudf_ser(data):
interfaces.append(data.__cuda_array_interface__)
if is_categorical_dtype(data.dtype):
interface = cat_codes[0].__cuda_array_interface__
else:
interface = data.__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
else:
for col in data:
for i, col in enumerate(data):
if is_categorical_dtype(data[col].dtype):
codes = data[col].cat.codes
codes = cat_codes[i]
interface = codes.__cuda_array_interface__
cat_codes.append(codes)
else:
interface = data[col].__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
return cat_codes, interfaces_str
return interfaces_str
def _transform_cudf_df(
@@ -469,25 +502,57 @@ def _transform_cudf_df(
except ImportError:
from cudf.utils.dtypes import is_categorical_dtype
if _is_cudf_ser(data):
dtypes = [data.dtype]
else:
dtypes = data.dtypes
if not all(
dtype.name in _pandas_dtype_mapper
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None:
if _is_cudf_ser(data):
feature_names = [data.name]
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif (
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
# Unique to cuDF, no equivalent in pandas 1.3.3
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
):
feature_names = list(map(str, data.columns))
else:
feature_names = data.columns.format()
# handle feature types
if feature_types is None:
feature_types = []
if _is_cudf_ser(data):
dtypes = [data.dtype]
else:
dtypes = data.dtypes
for dtype in dtypes:
if is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T)
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
return data, feature_names, feature_types
# handle categorical data
cat_codes = []
if _is_cudf_ser(data):
# unlike pandas, cuDF uses NA for missing data.
if is_categorical_dtype(data.dtype) and enable_categorical:
codes = data.cat.codes
cat_codes.append(codes)
else:
for col in data:
if is_categorical_dtype(data[col].dtype) and enable_categorical:
codes = data[col].cat.codes
cat_codes.append(codes)
return data, cat_codes, feature_names, feature_types
def _from_cudf_df(
@@ -498,10 +563,10 @@ def _from_cudf_df(
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
data, feature_names, feature_types = _transform_cudf_df(
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
data, feature_names, feature_types, enable_categorical
)
_, interfaces_str = _cudf_array_interfaces(data)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
handle = ctypes.c_void_p()
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
_check_call(
@@ -706,8 +771,9 @@ def dispatch_data_backend(
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
feature_types)
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
)
if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df(
data, missing, threads, feature_names, feature_types, enable_categorical
@@ -731,8 +797,9 @@ def dispatch_data_backend(
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_modin_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
feature_types)
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
)
if _has_array_protocol(data):
array = np.asarray(data)
return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@@ -865,8 +932,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_dt(data, name, dtype, handle)
return
if _is_modin_df(data):
data, _, _ = _transform_pandas_df(
data, False, meta=name, meta_type=dtype)
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_modin_series(data):
@@ -916,30 +982,38 @@ def _proxy_transform(
)
if _is_cupy_array(data):
data = _transform_cupy_array(data)
return data, feature_names, feature_types
return data, None, feature_names, feature_types
if _is_dlpack(data):
return _transform_dlpack(data), feature_names, feature_types
return _transform_dlpack(data), None, feature_names, feature_types
if _is_numpy_array(data):
return data, feature_names, feature_types
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
return data, feature_names, feature_types
return data, None, feature_names, feature_types
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
return arr, feature_names, feature_types
return arr, None, feature_names, feature_types
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
def dispatch_proxy_set_data(
proxy: _ProxyDMatrix,
data: Any,
cat_codes: Optional[list],
allow_host: bool,
) -> None:
"""Dispatch for DeviceQuantileDMatrix."""
if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data)
if _is_cudf_df(data):
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
return
if _is_cudf_ser(data):
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
return
if _is_cupy_array(data):
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212