[backport] Handle missing values in dataframe with category dtype. (#7331) (#7413)

* Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM (DeviceQuantileDMatrix) sketching.

* Fix cherry-pick error.
Author: Jiaming Yuan, 2021-11-10 21:24:46 +08:00 (committed by GitHub)
commit 14c56f05da, parent 11f8b5cfcd
13 changed files with 308 additions and 110 deletions
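
The heart of the change, sketched with plain pandas below (an illustration distilled from the diff, not code contained in this commit): `cat.codes` encodes a missing entry of a `category` column as -1, which XGBoost would otherwise ingest as a legitimate category index, so the patched code casts the codes to float32 and maps -1 back to NaN.

    import numpy as np
    import pandas as pd

    s = pd.Series(["a", "b", None], dtype="category")
    print(s.cat.codes.to_numpy())   # [ 0  1 -1] -- the -1 marks the missing entry
    # The same transform the patched _transform_pandas_df applies per column:
    codes = s.cat.codes.astype(np.float32).replace(-1.0, np.nan)
    print(codes.to_numpy())         # [ 0.  1. nan]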

=== File 1/13: Python core module (DataIter, _ProxyDMatrix, Booster) ===

@@ -386,7 +386,7 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
             raise exc  # pylint: disable=raising-bad-type

     def __del__(self) -> None:
-        assert self._temporary_data is None, self._temporary_data
+        assert self._temporary_data is None
         assert self._exception is None

     def _reset_wrapper(self, this: None) -> None:  # pylint: disable=unused-argument
@@ -410,19 +410,19 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
         feature_names: Optional[List[str]] = None,
         feature_types: Optional[List[str]] = None,
         **kwargs: Any,
-    ):
+    ) -> None:
         from .data import dispatch_proxy_set_data
         from .data import _proxy_transform

-        transformed, feature_names, feature_types = _proxy_transform(
+        new, cat_codes, feature_names, feature_types = _proxy_transform(
             data,
             feature_names,
             feature_types,
             self._enable_categorical,
         )
         # Stage the data, meta info are copied inside C++ MetaInfo.
-        self._temporary_data = transformed
-        dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
+        self._temporary_data = (new, cat_codes)
+        dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
         self.proxy.set_info(
             feature_names=feature_names,
             feature_types=feature_types,
@@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
         self.handle = ctypes.c_void_p()
         _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))

-    def _set_data_from_cuda_interface(self, data):
+    def _set_data_from_cuda_interface(self, data) -> None:
         """Set data from CUDA array interface."""
         interface = data.__cuda_array_interface__
         interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
             _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
         )

-    def _set_data_from_cuda_columnar(self, data):
+    def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
         """Set data from CUDA columnar format."""
         from .data import _cudf_array_interfaces

-        _, interfaces_str = _cudf_array_interfaces(data)
+        interfaces_str = _cudf_array_interfaces(data, cat_codes)
         _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))

     def _set_data_from_array(self, data: np.ndarray):
@@ -1986,13 +1986,6 @@ class Booster(object):
         preds = ctypes.POINTER(ctypes.c_float)()

         # once caching is supported, we can pass id(data) as cache id.
-        try:
-            import pandas as pd
-
-            if isinstance(data, pd.DataFrame):
-                data = data.values
-        except ImportError:
-            pass
         args = {
             "type": 0,
             "training": False,
@@ -2027,7 +2020,20 @@ class Booster(object):
                 f"got {data.shape[1]}"
             )

+        from .data import _is_pandas_df, _transform_pandas_df
         from .data import _array_interface
+
+        if (
+            _is_pandas_df(data)
+            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
+        ):
+            ft = self.feature_types
+            if ft is None:
+                enable_categorical = False
+            else:
+                enable_categorical = any(f == "c" for f in ft)
+        if _is_pandas_df(data):
+            data, _, _ = _transform_pandas_df(data, enable_categorical)
         if isinstance(data, np.ndarray):
             from .data import _ensure_np_dtype
             data, _ = _ensure_np_dtype(data, data.dtype)
@@ -2080,9 +2086,11 @@ class Booster(object):
             )
             return _prediction_output(shape, dims, preds, True)
         if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
-            from .data import _cudf_array_interfaces
-
-            _, interfaces_str = _cudf_array_interfaces(data)
+            from .data import _cudf_array_interfaces, _transform_cudf_df
+            data, cat_codes, _, _ = _transform_cudf_df(
+                data, None, None, enable_categorical
+            )
+            interfaces_str = _cudf_array_interfaces(data, cat_codes)
             _check_call(
                 _LIB.XGBoosterPredictFromCudaColumnar(
                     self.handle,
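
Taken together, the Booster hunks above stop flattening a DataFrame to `.values` before `inplace_predict` and instead derive `enable_categorical` from the booster's stored feature types. A minimal usage sketch (hypothetical data; assumes this patch and a CUDA build, since categorical splits required `gpu_hist` at the time):

    import pandas as pd
    import xgboost as xgb

    X = pd.DataFrame({"f0": pd.Categorical(["a", "b", None, "a"])})
    y = [0.0, 1.0, 0.0, 1.0]
    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=4)
    # The DataFrame is now routed through _transform_pandas_df, so the
    # categorical column and its missing value are encoded consistently
    # with training rather than passed through as raw -1 codes.
    predt = booster.inplace_predict(X)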

=== File 2/13: Python data-dispatching module for DMatrix ===

@@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, too-many-branches
+# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
 # pylint: disable=too-many-return-statements, import-error
 '''Data dispatching for DMatrix.'''
 import ctypes
@@ -12,7 +12,7 @@ import numpy as np
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
 from .core import DataIter, _ProxyDMatrix, DMatrix
-from .compat import lazy_isinstance
+from .compat import lazy_isinstance, DataFrame

 c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name
@@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
 }


+def _invalid_dataframe_dtype(data) -> None:
+    # pandas series has `dtypes` but it's just a single object
+    # cudf series doesn't have `dtypes`.
+    if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
+        bad_fields = [
+            str(data.columns[i])
+            for i, dtype in enumerate(data.dtypes)
+            if dtype.name not in _pandas_dtype_mapper
+        ]
+        err = " Invalid columns:" + ", ".join(bad_fields)
+    else:
+        err = ""
+
+    msg = """DataFrame.dtypes for data must be int, float, bool or category. When
+categorical type is supplied, DMatrix parameter `enable_categorical` must
+be set to `True`.""" + err
+    raise ValueError(msg)
+
+
 def _transform_pandas_df(
-    data,
+    data: DataFrame,
     enable_categorical: bool,
     feature_names: Optional[List[str]] = None,
     feature_types: Optional[List[str]] = None,
-    meta=None,
-    meta_type=None,
-):
+    meta: Optional[str] = None,
+    meta_type: Optional[str] = None,
+) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
     import pandas as pd
     from pandas.api.types import is_sparse, is_categorical_dtype

-    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
-               (is_categorical_dtype(dtype) and enable_categorical)
-               for dtype in data.dtypes):
-        bad_fields = [
-            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
-            if dtype.name not in _pandas_dtype_mapper
-        ]
-        msg = """DataFrame.dtypes for data must be int, float, bool or category. When
-categorical type is supplied, DMatrix parameter `enable_categorical` must
-be set to `True`."""
-        raise ValueError(msg + ', '.join(bad_fields))
+    if not all(
+        dtype.name in _pandas_dtype_mapper
+        or is_sparse(dtype)
+        or (is_categorical_dtype(dtype) and enable_categorical)
+        for dtype in data.dtypes
+    ):
+        _invalid_dataframe_dtype(data)

     # handle feature names
     if feature_names is None and meta is None:
         if isinstance(data.columns, pd.MultiIndex):
-            feature_names = [
-                ' '.join([str(x) for x in i]) for i in data.columns
-            ]
+            feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
         elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
             feature_names = list(map(str, data.columns))
         else:
@@ -263,21 +275,24 @@ def _transform_pandas_df(
         else:
             feature_types.append(_pandas_dtype_mapper[dtype.name])

-    # handle categorical codes.
+    # handle category codes.
     transformed = pd.DataFrame()
     if enable_categorical:
         for i, dtype in enumerate(data.dtypes):
             if is_categorical_dtype(dtype):
-                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+                # pandas uses -1 as default missing value for categorical data
+                transformed[data.columns[i]] = (
+                    data[data.columns[i]]
+                    .cat.codes.astype(np.float32)
+                    .replace(-1.0, np.NaN)
+                )
             else:
                 transformed[data.columns[i]] = data[data.columns[i]]
     else:
         transformed = data

     if meta and len(data.columns) > 1:
-        raise ValueError(
-            f"DataFrame for {meta} cannot have multiple columns"
-        )
+        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

     dtype = meta_type if meta_type else np.float32
     arr = transformed.values
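
Why `astype(np.float32)` precedes `replace`: `cat.codes` is a narrow integer dtype (typically int8), which cannot represent NaN, so the cast must happen before -1 can be mapped to a missing marker. A quick check, for illustration only:

    import numpy as np
    import pandas as pd

    codes = pd.Series(["a", None], dtype="category").cat.codes
    print(codes.dtype)  # int8 -- an integer column has no slot for NaN
    print(codes.astype(np.float32).replace(-1.0, np.nan).isna().sum())  # 1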
@@ -287,7 +302,7 @@ def _transform_pandas_df(


 def _from_pandas_df(
-    data,
+    data: DataFrame,
     enable_categorical: bool,
     missing,
     nthread,
@@ -300,6 +315,7 @@ def _from_pandas_df(
                               feature_types)

+
 def _is_pandas_series(data):
     try:
         import pandas as pd
@@ -318,13 +334,26 @@ def _is_modin_series(data):


 def _from_pandas_series(
     data,
-    missing,
-    nthread,
+    missing: float,
+    nthread: int,
+    enable_categorical: bool,
     feature_names: Optional[List[str]],
     feature_types: Optional[List[str]],
 ):
+    from pandas.api.types import is_categorical_dtype
+
+    if (data.dtype.name not in _pandas_dtype_mapper) and not (
+        is_categorical_dtype(data.dtype) and enable_categorical
+    ):
+        _invalid_dataframe_dtype(data)
+    if enable_categorical and is_categorical_dtype(data.dtype):
+        data = data.cat.codes
     return _from_numpy_array(
-        data.values.astype("float"), missing, nthread, feature_names, feature_types
+        data.values.reshape(data.shape[0], 1).astype("float"),
+        missing,
+        nthread,
+        feature_names,
+        feature_types,
     )
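
The Series path now validates the dtype through the shared `_invalid_dataframe_dtype` helper and reshapes the 1-D values into an explicit single column before handing off to the NumPy route. A small sketch of the resulting behavior (hypothetical data; assumes this patch):

    import pandas as pd
    import xgboost as xgb

    s = pd.Series(["a", "b", "a"], dtype="category")
    # Rejected without opting in; accepted as one categorical column with it.
    Xy = xgb.DMatrix(s, label=[0, 1, 0], enable_categorical=True)
    assert Xy.num_row() == 3 and Xy.num_col() == 1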
@@ -428,7 +457,7 @@ def _is_cudf_df(data):
     return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)


-def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
+def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
     """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
     data and a list of array interfaces. The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of array
@@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
     except ImportError:
         from cudf.utils.dtypes import is_categorical_dtype

-    cat_codes = []
     interfaces = []
     if _is_cudf_ser(data):
-        interfaces.append(data.__cuda_array_interface__)
+        if is_categorical_dtype(data.dtype):
+            interface = cat_codes[0].__cuda_array_interface__
+        else:
+            interface = data.__cuda_array_interface__
+        if "mask" in interface:
+            interface["mask"] = interface["mask"].__cuda_array_interface__
+        interfaces.append(interface)
     else:
-        for col in data:
+        for i, col in enumerate(data):
             if is_categorical_dtype(data[col].dtype):
-                codes = data[col].cat.codes
+                codes = cat_codes[i]
                 interface = codes.__cuda_array_interface__
-                cat_codes.append(codes)
             else:
                 interface = data[col].__cuda_array_interface__
             if "mask" in interface:
                 interface["mask"] = interface["mask"].__cuda_array_interface__
             interfaces.append(interface)
     interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
-    return cat_codes, interfaces_str
+    return interfaces_str


 def _transform_cudf_df(
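
The serialized bytes embed raw device pointers taken from each column's `__cuda_array_interface__`, which is why ownership of `cat_codes` moves to the caller: the payload holds pointers, not references. A CPU analogue of the hazard using NumPy's `__array_interface__` (illustrative only, not cuDF code):

    import json
    import numpy as np

    def array_interfaces(columns):
        # Serialize each column's interface dict, as _cudf_array_interfaces does.
        return bytes(json.dumps([c.__array_interface__ for c in columns]), "utf-8")

    codes = np.array([0, 1, -1], dtype=np.int8)  # the caller must keep this alive
    payload = array_interfaces([codes])
    # `payload` now contains codes.__array_interface__["data"][0], a bare
    # pointer; if `codes` were garbage-collected, the pointer would dangle.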
@@ -470,25 +503,57 @@ def _transform_cudf_df(
     except ImportError:
         from cudf.utils.dtypes import is_categorical_dtype

+    if _is_cudf_ser(data):
+        dtypes = [data.dtype]
+    else:
+        dtypes = data.dtypes
+    if not all(
+        dtype.name in _pandas_dtype_mapper
+        or (is_categorical_dtype(dtype) and enable_categorical)
+        for dtype in dtypes
+    ):
+        _invalid_dataframe_dtype(data)
+
+    # handle feature names
     if feature_names is None:
         if _is_cudf_ser(data):
             feature_names = [data.name]
         elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
             feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
+        elif (
+            lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
+            or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
+            # Unique to cuDF, no equivalence in pandas 1.3.3
+            or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
+        ):
+            feature_names = list(map(str, data.columns))
         else:
             feature_names = data.columns.format()

+    # handle feature types
     if feature_types is None:
         feature_types = []
-        if _is_cudf_ser(data):
-            dtypes = [data.dtype]
-        else:
-            dtypes = data.dtypes
         for dtype in dtypes:
             if is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])
-    return data, feature_names, feature_types
+
+    # handle categorical data
+    cat_codes = []
+    if _is_cudf_ser(data):
+        # unlike pandas, cuDF uses NA for missing data.
+        if is_categorical_dtype(data.dtype) and enable_categorical:
+            codes = data.cat.codes
+            cat_codes.append(codes)
+    else:
+        for col in data:
+            if is_categorical_dtype(data[col].dtype) and enable_categorical:
+                codes = data[col].cat.codes
+                cat_codes.append(codes)
+
+    return data, cat_codes, feature_names, feature_types


 def _from_cudf_df(
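
As the in-diff comment notes, cuDF differs from pandas here: missing categorical entries stay as nulls in `cat.codes` rather than being folded into -1, so no replacement pass is needed and the null mask travels with the codes. A hedged cuDF sketch (assumes a cudf installation and a GPU):

    import cudf

    s = cudf.Series(["a", "b", None], dtype="category")
    codes = s.cat.codes
    assert codes.has_nulls  # the missing entry is a null, not a -1 code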
@@ -499,10 +564,10 @@ def _from_cudf_df(
     feature_types: Optional[List[str]],
     enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, Any, Any]:
-    data, feature_names, feature_types = _transform_cudf_df(
+    data, cat_codes, feature_names, feature_types = _transform_cudf_df(
         data, feature_names, feature_types, enable_categorical
     )
-    _, interfaces_str = _cudf_array_interfaces(data)
+    interfaces_str = _cudf_array_interfaces(data, cat_codes)
     handle = ctypes.c_void_p()
     config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
     _check_call(
@@ -707,8 +772,9 @@ def dispatch_data_backend(
         return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_pandas_series(data):
-        return _from_pandas_series(data, missing, threads, feature_names,
-                                   feature_types)
+        return _from_pandas_series(
+            data, missing, threads, enable_categorical, feature_names, feature_types
+        )
     if _is_cudf_df(data) or _is_cudf_ser(data):
         return _from_cudf_df(
             data, missing, threads, feature_names, feature_types, enable_categorical
@@ -732,8 +798,9 @@ def dispatch_data_backend(
         return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_modin_series(data):
-        return _from_pandas_series(data, missing, threads, feature_names,
-                                   feature_types)
+        return _from_pandas_series(
+            data, missing, threads, enable_categorical, feature_names, feature_types
+        )
     if _has_array_protocol(data):
         array = np.asarray(data)
         return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
         _meta_from_dt(data, name, dtype, handle)
         return
     if _is_modin_df(data):
-        data, _, _ = _transform_pandas_df(
-            data, False, meta=name, meta_type=dtype)
+        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_modin_series(data):
@@ -917,30 +983,38 @@ def _proxy_transform(
     )
     if _is_cupy_array(data):
         data = _transform_cupy_array(data)
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
     if _is_dlpack(data):
-        return _transform_dlpack(data), feature_names, feature_types
+        return _transform_dlpack(data), None, feature_names, feature_types
     if _is_numpy_array(data):
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
     if _is_scipy_csr(data):
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
     if _is_pandas_df(data):
         arr, feature_names, feature_types = _transform_pandas_df(
             data, enable_categorical, feature_names, feature_types
         )
-        return arr, feature_names, feature_types
+        return arr, None, feature_names, feature_types
     raise TypeError("Value type is not supported for data iterator:" + str(type(data)))


-def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
+def dispatch_proxy_set_data(
+    proxy: _ProxyDMatrix,
+    data: Any,
+    cat_codes: Optional[list],
+    allow_host: bool,
+) -> None:
     """Dispatch for DeviceQuantileDMatrix."""
     if not _is_cudf_ser(data) and not _is_pandas_series(data):
         _check_data_shape(data)
     if _is_cudf_df(data):
-        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
+        # pylint: disable=W0212
+        proxy._set_data_from_cuda_columnar(data, cat_codes)
         return
     if _is_cudf_ser(data):
-        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
+        # pylint: disable=W0212
+        proxy._set_data_from_cuda_columnar(data, cat_codes)
         return
     if _is_cupy_array(data):
         proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212

=== File 3/13: C++ common header categorical.h ===

@@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 by XGBoost Contributors
+ * Copyright 2020-2021 by XGBoost Contributors
  * \file categorical.h
  */
 #ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
   return !s_cats.Check(cat);
 }

+inline void CheckCat(bst_cat_t cat) {
+  CHECK_GE(cat, 0) << "Invalid categorical value detected. Categorical value "
+                      "should be non-negative.";
+}
+
 struct IsCatOp {
   XGBOOST_DEVICE bool operator()(FeatureType ft) {
     return ft == FeatureType::kCategorical;

=== File 4/13: CUDA sketching source (RemoveDuplicatedCategories) ===

@@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
     int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
     dh::device_vector<Entry> *p_sorted_entries,
     dh::caching_device_vector<size_t> *p_column_sizes_scan) {
+  info.feature_types.SetDevice(device);
   auto d_feature_types = info.feature_types.ConstDeviceSpan();
   CHECK(!d_feature_types.empty());
   auto &column_sizes_scan = *p_column_sizes_scan;

=== File 5/13: CUDA sketching header (ProcessSlidingWindow, AdapterDeviceSketch) ===

@@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
 void SortByWeight(dh::device_vector<float>* weights,
                   dh::device_vector<Entry>* sorted_entries);

+void RemoveDuplicatedCategories(
+    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
+    dh::device_vector<Entry> *p_sorted_entries,
+    dh::caching_device_vector<size_t> *p_column_sizes_scan);
+
 }  // namespace detail

 // Compute sketch on DMatrix.
@@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
                            size_t sketch_batch_num_elements = 0);

 template <typename AdapterBatch>
-void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
-                          size_t begin, size_t end, float missing,
-                          SketchContainer* sketch_container, int num_cuts) {
+void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
+                          int device, size_t columns, size_t begin, size_t end,
+                          float missing, SketchContainer *sketch_container,
+                          int num_cuts) {
   // Copy current subset of valid elements into temporary storage and sort
   dh::device_vector<Entry> sorted_entries;
   dh::caching_device_vector<size_t> column_sizes_scan;
@@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
       thrust::make_counting_iterator(0llu),
       [=] __device__(size_t idx) { return batch.GetElement(idx); });
   HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
+  cuts_ptr.SetDevice(device);
   detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
                                  columns, num_cuts, device,
                                  &cuts_ptr,
@@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());

-  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
-  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+  if (sketch_container->HasCategorical()) {
+    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
+  }
+
+  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+  auto const &h_cuts_ptr = cuts_ptr.HostVector();
   // Extract the cuts from all columns concurrently
   sketch_container->Push(dh::ToSpan(sorted_entries),
                          dh::ToSpan(column_sizes_scan), d_cuts_ptr,
@@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,

   detail::SortByWeight(&temp_weights, &sorted_entries);

+  if (sketch_container->HasCategorical()) {
+    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
+  }
+
   auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
                                                  device, num_cuts_per_feature, false);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
       size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
-      ProcessSlidingWindow(batch, device, num_cols,
-                           begin, end, missing, sketch_container, num_cuts_per_feature);
+      ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
+                           sketch_container, num_cuts_per_feature);
     }
   }
 }
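
What the `RemoveDuplicatedCategories` calls buy the sketch: for a categorical feature every distinct value must survive as its own cut, so after sorting, duplicates are dropped per column instead of being quantile-compressed. A NumPy analogue of that step on a single column (illustrative; the real code operates on device spans):

    import numpy as np

    col = np.array([2.0, 0.0, 1.0, 2.0, 0.0], dtype=np.float32)
    cuts = np.unique(col)  # sorted, one entry per category
    print(cuts)  # [0. 1. 2.] -- cf. TotalBins() == n_uniques in the test below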

=== File 6/13: C++ adapter header (COOTuple, IsValidFunctor) ===

@@ -21,6 +21,7 @@
 #include "array_interface.h"
 #include "../c_api/c_api_error.h"
+#include "../common/math.h"

 namespace xgboost {
 namespace data {
@@ -80,6 +81,24 @@ struct COOTuple {
   float value{0};
 };

+struct IsValidFunctor {
+  float missing;
+
+  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
+
+  XGBOOST_DEVICE bool operator()(float value) const {
+    return !(common::CheckNAN(value) || value == missing);
+  }
+
+  XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
+    return !(common::CheckNAN(e.value) || e.value == missing);
+  }
+
+  XGBOOST_DEVICE bool operator()(const Entry& e) const {
+    return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
+  }
+};
+
 namespace detail {

 /**
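
The unified functor replaces the ad-hoc `CheckNAN(value) || value == missing` checks previously duplicated across host and device code. Its logic, restated in Python for clarity: NaN must be tested explicitly because `NaN != NaN`, so a NaN `missing` marker is still caught by the first test.

    import math

    def is_valid(value: float, missing: float) -> bool:
        return not (math.isnan(value) or value == missing)

    assert is_valid(1.0, missing=-999.0)
    assert not is_valid(float("nan"), missing=-999.0)
    assert not is_valid(-999.0, missing=-999.0)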

=== File 7/13: C++ data source (SparsePage::Push) ===

@@ -987,18 +987,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread

   // Second pass over batch, placing elements in correct position
+  auto is_valid = data::IsValidFunctor{missing};
 #pragma omp parallel num_threads(nthread)
   {
     exec.Run([&]() {
       int tid = omp_get_thread_num();
-      size_t begin = tid*thread_size;
-      size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
+      size_t begin = tid * thread_size;
+      size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
       for (size_t i = begin; i < end; ++i) {
         auto line = batch.GetLine(i);
         for (auto j = 0ull; j < line.Size(); j++) {
           auto element = line.GetElement(j);
           const size_t key = (element.row_idx - base_rowid);
-          if (!common::CheckNAN(element.value) && element.value != missing) {
+          if (is_valid(element)) {
             builder.Push(key, Entry(element.column_idx, element.value), tid);
           }
         }
       }

=== File 8/13: CUDA device adapter (IsValidFunctor removed, CudfAdapterBatch) ===

@@ -15,29 +15,6 @@
 namespace xgboost {
 namespace data {

-struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
-  float missing;
-  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
-  __device__ bool operator()(float value) const {
-    return !(common::CheckNAN(value) || value == missing);
-  }
-  __device__ bool operator()(const data::COOTuple& e) const {
-    if (common::CheckNAN(e.value) || e.value == missing) {
-      return false;
-    }
-    return true;
-  }
-  __device__ bool operator()(const Entry& e) const {
-    if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
-      return false;
-    }
-    return true;
-  }
-};
-
 class CudfAdapterBatch : public detail::NoMetaInfo {
   friend class CudfAdapter;

=== File 9/13: CUDA source (IterativeDeviceDMatrix::Initialize) ===

@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missing
   if (batches == 1) {
     this->info_ = std::move(proxy->Info());
+    this->info_.num_nonzero_ = nnz;
     CHECK_EQ(proxy->Info().labels_.Size(), 0);
   }

=== File 10/13: CUDA GPU hist updater (GPUHistMakerDevice) ===

@@ -585,6 +585,7 @@ struct GPUHistMakerDevice {
       CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
           << "Categorical feature value too large.";
       auto cat = common::AsCat(candidate.split.fvalue);
+      common::CheckCat(cat);
       std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
       LBitField32 cats_bits(split_cats);
       cats_bits.Set(cat);

=== File 11/13: C++ tests (HistUtil sketching) ===

@@ -392,6 +392,52 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
   EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
 }

+void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
+                                  int32_t num_bins, bool weighted) {
+  auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
+  thrust::device_vector<float> x(h_x);
+  auto adapter = AdapterFromData(x, n, 1);
+  MetaInfo info;
+  info.num_row_ = n;
+  info.num_col_ = 1;
+  info.feature_types.HostVector().push_back(FeatureType::kCategorical);
+
+  if (weighted) {
+    std::vector<float> weights(n, 0);
+    SimpleLCG lcg;
+    SimpleRealUniformDistribution<float> dist(0, 1);
+    for (auto& v : weights) {
+      v = dist(&lcg);
+    }
+    info.weights_.HostVector() = weights;
+  }
+
+  ASSERT_EQ(info.feature_types.Size(), 1);
+  SketchContainer container(info.feature_types, num_bins, 1, n, 0);
+  AdapterDeviceSketch(adapter.Value(), num_bins, info,
+                      std::numeric_limits<float>::quiet_NaN(), &container);
+  HistogramCuts cuts;
+  container.MakeCuts(&cuts);
+
+  thrust::sort(x.begin(), x.end());
+  auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
+  ASSERT_NE(n_uniques, x.size());
+  ASSERT_EQ(cuts.TotalBins(), n_uniques);
+  ASSERT_EQ(n_uniques, num_categories);
+
+  auto& values = cuts.cut_values_.HostVector();
+  ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
+  auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
+  ASSERT_TRUE(is_unique);
+
+  x.resize(n_uniques);
+  h_x.resize(n_uniques);
+  thrust::copy(x.begin(), x.end(), h_x.begin());
+  for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
+    ASSERT_EQ(h_x[i], values[i]);
+  }
+}
+
 TEST(HistUtil, AdapterDeviceSketchCategorical) {
   int categorical_sizes[] = {2, 6, 8, 12};
   int num_bins = 256;
@@ -404,6 +450,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
       auto adapter = AdapterFromData(x_device, n, 1);
       ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
                           adapter.NumRows(), dmat.get());
+      TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
+      TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
     }
   }
 }

=== File 12/13: Python GPU tests (cuDF ingestion) ===

@@ -186,6 +186,37 @@ Arrow specification.'''
     assert len(Xy.feature_types) == X.shape[1]
     assert all(t == "c" for t in Xy.feature_types)

+    # test missing value
+    X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
+    X["f0"] = X["f0"].astype("category")
+    df, cat_codes, _, _ = xgb.data._transform_cudf_df(
+        X, None, None, enable_categorical=True
+    )
+    for col in cat_codes:
+        assert col.has_nulls
+
+    y = [0, 1, 2]
+    with pytest.raises(ValueError):
+        xgb.DMatrix(X, y)
+
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    assert Xy.num_row() == 3
+    assert Xy.num_col() == 1
+
+    with pytest.raises(ValueError):
+        xgb.DeviceQuantileDMatrix(X, y)
+
+    Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
+    assert Xy.num_row() == 3
+    assert Xy.num_col() == 1
+
+    X = X["f0"]
+    with pytest.raises(ValueError):
+        xgb.DMatrix(X, y)
+
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    assert Xy.num_row() == 3
+    assert Xy.num_col() == 1
+

 @pytest.mark.skipif(**tm.no_cudf())
 @pytest.mark.skipif(**tm.no_cupy())

=== File 13/13: Python tests (pandas ingestion) ===

@@ -138,9 +138,22 @@ class TestPandas:
             X, enable_categorical=True
         )

-        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
         assert transformed[:, 0].min() == 0

+        # test missing value
+        X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
+        X["f0"] = X["f0"].astype("category")
+        arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
+        assert not np.any(arr == -1.0)
+
+        X = X["f0"]
+        with pytest.raises(ValueError):
+            xgb.DMatrix(X, y)
+
+        Xy = xgb.DMatrix(X, y, enable_categorical=True)
+        assert Xy.num_row() == 3
+        assert Xy.num_col() == 1
+
     def test_pandas_sparse(self):
         import pandas as pd

         rows = 100