* Handle missing values in dataframe with category dtype. (#7331)
* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
* Fix pick error.
This commit is contained in:
@@ -386,7 +386,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
|
||||
raise exc # pylint: disable=raising-bad-type
|
||||
|
||||
def __del__(self) -> None:
|
||||
assert self._temporary_data is None, self._temporary_data
|
||||
assert self._temporary_data is None
|
||||
assert self._exception is None
|
||||
|
||||
def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
|
||||
@@ -410,19 +410,19 @@ class DataIter: # pylint: disable=too-many-instance-attributes
|
||||
feature_names: Optional[List[str]] = None,
|
||||
feature_types: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
):
|
||||
) -> None:
|
||||
from .data import dispatch_proxy_set_data
|
||||
from .data import _proxy_transform
|
||||
|
||||
transformed, feature_names, feature_types = _proxy_transform(
|
||||
new, cat_codes, feature_names, feature_types = _proxy_transform(
|
||||
data,
|
||||
feature_names,
|
||||
feature_types,
|
||||
self._enable_categorical,
|
||||
)
|
||||
# Stage the data, meta info are copied inside C++ MetaInfo.
|
||||
self._temporary_data = transformed
|
||||
dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
|
||||
self._temporary_data = (new, cat_codes)
|
||||
dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
|
||||
self.proxy.set_info(
|
||||
feature_names=feature_names,
|
||||
feature_types=feature_types,
|
||||
@@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
|
||||
self.handle = ctypes.c_void_p()
|
||||
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
|
||||
|
||||
def _set_data_from_cuda_interface(self, data):
|
||||
def _set_data_from_cuda_interface(self, data) -> None:
|
||||
"""Set data from CUDA array interface."""
|
||||
interface = data.__cuda_array_interface__
|
||||
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
|
||||
@@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
|
||||
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
|
||||
)
|
||||
|
||||
def _set_data_from_cuda_columnar(self, data):
|
||||
def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
|
||||
"""Set data from CUDA columnar format."""
|
||||
from .data import _cudf_array_interfaces
|
||||
|
||||
_, interfaces_str = _cudf_array_interfaces(data)
|
||||
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
|
||||
|
||||
def _set_data_from_array(self, data: np.ndarray):
|
||||
@@ -1986,13 +1986,6 @@ class Booster(object):
|
||||
preds = ctypes.POINTER(ctypes.c_float)()
|
||||
|
||||
# once caching is supported, we can pass id(data) as cache id.
|
||||
try:
|
||||
import pandas as pd
|
||||
|
||||
if isinstance(data, pd.DataFrame):
|
||||
data = data.values
|
||||
except ImportError:
|
||||
pass
|
||||
args = {
|
||||
"type": 0,
|
||||
"training": False,
|
||||
@@ -2027,7 +2020,20 @@ class Booster(object):
|
||||
f"got {data.shape[1]}"
|
||||
)
|
||||
|
||||
from .data import _is_pandas_df, _transform_pandas_df
|
||||
from .data import _array_interface
|
||||
if (
|
||||
_is_pandas_df(data)
|
||||
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
|
||||
):
|
||||
ft = self.feature_types
|
||||
if ft is None:
|
||||
enable_categorical = False
|
||||
else:
|
||||
enable_categorical = any(f == "c" for f in ft)
|
||||
if _is_pandas_df(data):
|
||||
data, _, _ = _transform_pandas_df(data, enable_categorical)
|
||||
|
||||
if isinstance(data, np.ndarray):
|
||||
from .data import _ensure_np_dtype
|
||||
data, _ = _ensure_np_dtype(data, data.dtype)
|
||||
@@ -2080,9 +2086,11 @@ class Booster(object):
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, True)
|
||||
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
|
||||
from .data import _cudf_array_interfaces
|
||||
|
||||
_, interfaces_str = _cudf_array_interfaces(data)
|
||||
from .data import _cudf_array_interfaces, _transform_cudf_df
|
||||
data, cat_codes, _, _ = _transform_cudf_df(
|
||||
data, None, None, enable_categorical
|
||||
)
|
||||
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromCudaColumnar(
|
||||
self.handle,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# pylint: disable=too-many-arguments, too-many-branches
|
||||
# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
|
||||
# pylint: disable=too-many-return-statements, import-error
|
||||
'''Data dispatching for DMatrix.'''
|
||||
import ctypes
|
||||
@@ -12,7 +12,7 @@ import numpy as np
|
||||
from .core import c_array, _LIB, _check_call, c_str
|
||||
from .core import _cuda_array_interface
|
||||
from .core import DataIter, _ProxyDMatrix, DMatrix
|
||||
from .compat import lazy_isinstance
|
||||
from .compat import lazy_isinstance, DataFrame
|
||||
|
||||
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
|
||||
|
||||
@@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
|
||||
}
|
||||
|
||||
|
||||
def _invalid_dataframe_dtype(data) -> None:
|
||||
# pandas series has `dtypes` but it's just a single object
|
||||
# cudf series doesn't have `dtypes`.
|
||||
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
|
||||
bad_fields = [
|
||||
str(data.columns[i])
|
||||
for i, dtype in enumerate(data.dtypes)
|
||||
if dtype.name not in _pandas_dtype_mapper
|
||||
]
|
||||
err = " Invalid columns:" + ", ".join(bad_fields)
|
||||
else:
|
||||
err = ""
|
||||
|
||||
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
|
||||
categorical type is supplied, DMatrix parameter `enable_categorical` must
|
||||
be set to `True`.""" + err
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
def _transform_pandas_df(
|
||||
data,
|
||||
data: DataFrame,
|
||||
enable_categorical: bool,
|
||||
feature_names: Optional[List[str]] = None,
|
||||
feature_types: Optional[List[str]] = None,
|
||||
meta=None,
|
||||
meta_type=None,
|
||||
):
|
||||
meta: Optional[str] = None,
|
||||
meta_type: Optional[str] = None,
|
||||
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_sparse, is_categorical_dtype
|
||||
|
||||
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
|
||||
(is_categorical_dtype(dtype) and enable_categorical)
|
||||
for dtype in data.dtypes):
|
||||
bad_fields = [
|
||||
str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
|
||||
if dtype.name not in _pandas_dtype_mapper
|
||||
]
|
||||
|
||||
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
|
||||
categorical type is supplied, DMatrix parameter `enable_categorical` must
|
||||
be set to `True`."""
|
||||
raise ValueError(msg + ', '.join(bad_fields))
|
||||
if not all(
|
||||
dtype.name in _pandas_dtype_mapper
|
||||
or is_sparse(dtype)
|
||||
or (is_categorical_dtype(dtype) and enable_categorical)
|
||||
for dtype in data.dtypes
|
||||
):
|
||||
_invalid_dataframe_dtype(data)
|
||||
|
||||
# handle feature names
|
||||
if feature_names is None and meta is None:
|
||||
if isinstance(data.columns, pd.MultiIndex):
|
||||
feature_names = [
|
||||
' '.join([str(x) for x in i]) for i in data.columns
|
||||
]
|
||||
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
||||
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
|
||||
feature_names = list(map(str, data.columns))
|
||||
else:
|
||||
@@ -263,21 +275,24 @@ def _transform_pandas_df(
|
||||
else:
|
||||
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
||||
|
||||
# handle categorical codes.
|
||||
# handle category codes.
|
||||
transformed = pd.DataFrame()
|
||||
if enable_categorical:
|
||||
for i, dtype in enumerate(data.dtypes):
|
||||
if is_categorical_dtype(dtype):
|
||||
transformed[data.columns[i]] = data[data.columns[i]].cat.codes
|
||||
# pandas uses -1 as default missing value for categorical data
|
||||
transformed[data.columns[i]] = (
|
||||
data[data.columns[i]]
|
||||
.cat.codes.astype(np.float32)
|
||||
.replace(-1.0, np.NaN)
|
||||
)
|
||||
else:
|
||||
transformed[data.columns[i]] = data[data.columns[i]]
|
||||
else:
|
||||
transformed = data
|
||||
|
||||
if meta and len(data.columns) > 1:
|
||||
raise ValueError(
|
||||
f"DataFrame for {meta} cannot have multiple columns"
|
||||
)
|
||||
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
|
||||
|
||||
dtype = meta_type if meta_type else np.float32
|
||||
arr = transformed.values
|
||||
@@ -287,7 +302,7 @@ def _transform_pandas_df(
|
||||
|
||||
|
||||
def _from_pandas_df(
|
||||
data,
|
||||
data: DataFrame,
|
||||
enable_categorical: bool,
|
||||
missing,
|
||||
nthread,
|
||||
@@ -300,6 +315,7 @@ def _from_pandas_df(
|
||||
feature_types)
|
||||
|
||||
|
||||
|
||||
def _is_pandas_series(data):
|
||||
try:
|
||||
import pandas as pd
|
||||
@@ -318,13 +334,26 @@ def _is_modin_series(data):
|
||||
|
||||
def _from_pandas_series(
|
||||
data,
|
||||
missing,
|
||||
nthread,
|
||||
missing: float,
|
||||
nthread: int,
|
||||
enable_categorical: bool,
|
||||
feature_names: Optional[List[str]],
|
||||
feature_types: Optional[List[str]],
|
||||
):
|
||||
from pandas.api.types import is_categorical_dtype
|
||||
|
||||
if (data.dtype.name not in _pandas_dtype_mapper) and not (
|
||||
is_categorical_dtype(data.dtype) and enable_categorical
|
||||
):
|
||||
_invalid_dataframe_dtype(data)
|
||||
if enable_categorical and is_categorical_dtype(data.dtype):
|
||||
data = data.cat.codes
|
||||
return _from_numpy_array(
|
||||
data.values.astype("float"), missing, nthread, feature_names, feature_types
|
||||
data.values.reshape(data.shape[0], 1).astype("float"),
|
||||
missing,
|
||||
nthread,
|
||||
feature_names,
|
||||
feature_types,
|
||||
)
|
||||
|
||||
|
||||
@@ -428,7 +457,7 @@ def _is_cudf_df(data):
|
||||
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
|
||||
|
||||
|
||||
def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
|
||||
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
|
||||
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
|
||||
data and a list of array interfaces. The data is list of categorical codes that
|
||||
caller can safely ignore, but have to keep their reference alive until usage of array
|
||||
@@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
|
||||
except ImportError:
|
||||
from cudf.utils.dtypes import is_categorical_dtype
|
||||
|
||||
cat_codes = []
|
||||
interfaces = []
|
||||
if _is_cudf_ser(data):
|
||||
interfaces.append(data.__cuda_array_interface__)
|
||||
if is_categorical_dtype(data.dtype):
|
||||
interface = cat_codes[0].__cuda_array_interface__
|
||||
else:
|
||||
interface = data.__cuda_array_interface__
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
interfaces.append(interface)
|
||||
else:
|
||||
for col in data:
|
||||
for i, col in enumerate(data):
|
||||
if is_categorical_dtype(data[col].dtype):
|
||||
codes = data[col].cat.codes
|
||||
codes = cat_codes[i]
|
||||
interface = codes.__cuda_array_interface__
|
||||
cat_codes.append(codes)
|
||||
else:
|
||||
interface = data[col].__cuda_array_interface__
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
interfaces.append(interface)
|
||||
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
|
||||
return cat_codes, interfaces_str
|
||||
return interfaces_str
|
||||
|
||||
|
||||
def _transform_cudf_df(
|
||||
@@ -470,25 +503,57 @@ def _transform_cudf_df(
|
||||
except ImportError:
|
||||
from cudf.utils.dtypes import is_categorical_dtype
|
||||
|
||||
if _is_cudf_ser(data):
|
||||
dtypes = [data.dtype]
|
||||
else:
|
||||
dtypes = data.dtypes
|
||||
|
||||
if not all(
|
||||
dtype.name in _pandas_dtype_mapper
|
||||
or (is_categorical_dtype(dtype) and enable_categorical)
|
||||
for dtype in dtypes
|
||||
):
|
||||
_invalid_dataframe_dtype(data)
|
||||
|
||||
# handle feature names
|
||||
if feature_names is None:
|
||||
if _is_cudf_ser(data):
|
||||
feature_names = [data.name]
|
||||
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
|
||||
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
|
||||
elif (
|
||||
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
|
||||
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
|
||||
# Unique to cuDF, no equivalence in pandas 1.3.3
|
||||
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
|
||||
):
|
||||
feature_names = list(map(str, data.columns))
|
||||
else:
|
||||
feature_names = data.columns.format()
|
||||
|
||||
# handle feature types
|
||||
if feature_types is None:
|
||||
feature_types = []
|
||||
if _is_cudf_ser(data):
|
||||
dtypes = [data.dtype]
|
||||
else:
|
||||
dtypes = data.dtypes
|
||||
for dtype in dtypes:
|
||||
if is_categorical_dtype(dtype) and enable_categorical:
|
||||
feature_types.append(CAT_T)
|
||||
else:
|
||||
feature_types.append(_pandas_dtype_mapper[dtype.name])
|
||||
return data, feature_names, feature_types
|
||||
|
||||
# handle categorical data
|
||||
cat_codes = []
|
||||
if _is_cudf_ser(data):
|
||||
# unlike pandas, cuDF uses NA for missing data.
|
||||
if is_categorical_dtype(data.dtype) and enable_categorical:
|
||||
codes = data.cat.codes
|
||||
cat_codes.append(codes)
|
||||
else:
|
||||
for col in data:
|
||||
if is_categorical_dtype(data[col].dtype) and enable_categorical:
|
||||
codes = data[col].cat.codes
|
||||
cat_codes.append(codes)
|
||||
|
||||
return data, cat_codes, feature_names, feature_types
|
||||
|
||||
|
||||
def _from_cudf_df(
|
||||
@@ -499,10 +564,10 @@ def _from_cudf_df(
|
||||
feature_types: Optional[List[str]],
|
||||
enable_categorical: bool,
|
||||
) -> Tuple[ctypes.c_void_p, Any, Any]:
|
||||
data, feature_names, feature_types = _transform_cudf_df(
|
||||
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
|
||||
data, feature_names, feature_types, enable_categorical
|
||||
)
|
||||
_, interfaces_str = _cudf_array_interfaces(data)
|
||||
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||
handle = ctypes.c_void_p()
|
||||
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
|
||||
_check_call(
|
||||
@@ -707,8 +772,9 @@ def dispatch_data_backend(
|
||||
return _from_pandas_df(data, enable_categorical, missing, threads,
|
||||
feature_names, feature_types)
|
||||
if _is_pandas_series(data):
|
||||
return _from_pandas_series(data, missing, threads, feature_names,
|
||||
feature_types)
|
||||
return _from_pandas_series(
|
||||
data, missing, threads, enable_categorical, feature_names, feature_types
|
||||
)
|
||||
if _is_cudf_df(data) or _is_cudf_ser(data):
|
||||
return _from_cudf_df(
|
||||
data, missing, threads, feature_names, feature_types, enable_categorical
|
||||
@@ -732,8 +798,9 @@ def dispatch_data_backend(
|
||||
return _from_pandas_df(data, enable_categorical, missing, threads,
|
||||
feature_names, feature_types)
|
||||
if _is_modin_series(data):
|
||||
return _from_pandas_series(data, missing, threads, feature_names,
|
||||
feature_types)
|
||||
return _from_pandas_series(
|
||||
data, missing, threads, enable_categorical, feature_names, feature_types
|
||||
)
|
||||
if _has_array_protocol(data):
|
||||
array = np.asarray(data)
|
||||
return _from_numpy_array(array, missing, threads, feature_names, feature_types)
|
||||
@@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
|
||||
_meta_from_dt(data, name, dtype, handle)
|
||||
return
|
||||
if _is_modin_df(data):
|
||||
data, _, _ = _transform_pandas_df(
|
||||
data, False, meta=name, meta_type=dtype)
|
||||
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
|
||||
_meta_from_numpy(data, name, dtype, handle)
|
||||
return
|
||||
if _is_modin_series(data):
|
||||
@@ -917,30 +983,38 @@ def _proxy_transform(
|
||||
)
|
||||
if _is_cupy_array(data):
|
||||
data = _transform_cupy_array(data)
|
||||
return data, feature_names, feature_types
|
||||
return data, None, feature_names, feature_types
|
||||
if _is_dlpack(data):
|
||||
return _transform_dlpack(data), feature_names, feature_types
|
||||
return _transform_dlpack(data), None, feature_names, feature_types
|
||||
if _is_numpy_array(data):
|
||||
return data, feature_names, feature_types
|
||||
return data, None, feature_names, feature_types
|
||||
if _is_scipy_csr(data):
|
||||
return data, feature_names, feature_types
|
||||
return data, None, feature_names, feature_types
|
||||
if _is_pandas_df(data):
|
||||
arr, feature_names, feature_types = _transform_pandas_df(
|
||||
data, enable_categorical, feature_names, feature_types
|
||||
)
|
||||
return arr, feature_names, feature_types
|
||||
return arr, None, feature_names, feature_types
|
||||
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
|
||||
|
||||
|
||||
def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
|
||||
def dispatch_proxy_set_data(
|
||||
proxy: _ProxyDMatrix,
|
||||
data: Any,
|
||||
cat_codes: Optional[list],
|
||||
allow_host: bool,
|
||||
) -> None:
|
||||
"""Dispatch for DeviceQuantileDMatrix."""
|
||||
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
||||
_check_data_shape(data)
|
||||
|
||||
if _is_cudf_df(data):
|
||||
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
|
||||
# pylint: disable=W0212
|
||||
proxy._set_data_from_cuda_columnar(data, cat_codes)
|
||||
return
|
||||
if _is_cudf_ser(data):
|
||||
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
|
||||
# pylint: disable=W0212
|
||||
proxy._set_data_from_cuda_columnar(data, cat_codes)
|
||||
return
|
||||
if _is_cupy_array(data):
|
||||
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
|
||||
|
||||
Reference in New Issue
Block a user