[backport] Handle missing values in dataframe with category dtype. (#7331) (#7413)

* Handle missing values in dataframe with category dtype. (#7331)
* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.
* Fix pick error.
2021-11-10 21:24:46 +08:00 · 2021-11-10 21:24:46 +08:00 · 14c56f05da
commit 14c56f05da
parent 11f8b5cfcd
13 changed files with 308 additions and 110 deletions
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@ -386,7 +386,7 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
            raise exc  # pylint: disable=raising-bad-type
    def __del__(self) -> None:
-        assert self._temporary_data is None, self._temporary_data
+        assert self._temporary_data is None
        assert self._exception is None
    def _reset_wrapper(self, this: None) -> None:  # pylint: disable=unused-argument
@ -410,19 +410,19 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
            feature_names: Optional[List[str]] = None,
            feature_types: Optional[List[str]] = None,
            **kwargs: Any,
-        ):
+        ) -> None:
            from .data import dispatch_proxy_set_data
            from .data import _proxy_transform
-            transformed, feature_names, feature_types = _proxy_transform(
+            new, cat_codes, feature_names, feature_types = _proxy_transform(
                data,
                feature_names,
                feature_types,
                self._enable_categorical,
            )
            # Stage the data, meta info are copied inside C++ MetaInfo.
-            self._temporary_data = transformed
+            self._temporary_data = (new, cat_codes)
-            dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
+            dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
            self.proxy.set_info(
                feature_names=feature_names,
                feature_types=feature_types,
@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
        self.handle = ctypes.c_void_p()
        _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
-    def _set_data_from_cuda_interface(self, data):
+    def _set_data_from_cuda_interface(self, data) -> None:
        """Set data from CUDA array interface."""
        interface = data.__cuda_array_interface__
        interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
            _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
        )
-    def _set_data_from_cuda_columnar(self, data):
+    def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
        """Set data from CUDA columnar format."""
        from .data import _cudf_array_interfaces
-        _, interfaces_str = _cudf_array_interfaces(data)
+        interfaces_str = _cudf_array_interfaces(data, cat_codes)
        _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
    def _set_data_from_array(self, data: np.ndarray):
@ -1986,13 +1986,6 @@ class Booster(object):
        preds = ctypes.POINTER(ctypes.c_float)()
        # once caching is supported, we can pass id(data) as cache id.
        try:
            import pandas as pd
            if isinstance(data, pd.DataFrame):
                data = data.values
        except ImportError:
            pass
        args = {
            "type": 0,
            "training": False,
@ -2027,7 +2020,20 @@ class Booster(object):
                    f"got {data.shape[1]}"
                )
        from .data import _is_pandas_df, _transform_pandas_df
        from .data import _array_interface
        if (
            _is_pandas_df(data)
            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
        ):
            ft = self.feature_types
            if ft is None:
                enable_categorical = False
            else:
                enable_categorical = any(f == "c" for f in ft)
        if _is_pandas_df(data):
            data, _, _ = _transform_pandas_df(data, enable_categorical)
        if isinstance(data, np.ndarray):
            from .data import _ensure_np_dtype
            data, _ = _ensure_np_dtype(data, data.dtype)
@ -2080,9 +2086,11 @@ class Booster(object):
            )
            return _prediction_output(shape, dims, preds, True)
        if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
-            from .data import _cudf_array_interfaces
+            from .data import _cudf_array_interfaces, _transform_cudf_df
-
+            data, cat_codes, _, _ = _transform_cudf_df(
-            _, interfaces_str = _cudf_array_interfaces(data)
+                data, None, None, enable_categorical
            )
            interfaces_str = _cudf_array_interfaces(data, cat_codes)
            _check_call(
                _LIB.XGBoosterPredictFromCudaColumnar(
                    self.handle,
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, too-many-branches
+# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
 # pylint: disable=too-many-return-statements, import-error
 '''Data dispatching for DMatrix.'''
 import ctypes
@ -12,7 +12,7 @@ import numpy as np
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
 from .core import DataIter, _ProxyDMatrix, DMatrix
-from .compat import lazy_isinstance
+from .compat import lazy_isinstance, DataFrame
 c_bst_ulong = ctypes.c_uint64   # pylint: disable=invalid-name
@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
 }
 def _invalid_dataframe_dtype(data) -> None:
    # pandas series has `dtypes` but it's just a single object
    # cudf series doesn't have `dtypes`.
    if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
        bad_fields = [
            str(data.columns[i])
            for i, dtype in enumerate(data.dtypes)
            if dtype.name not in _pandas_dtype_mapper
        ]
        err = " Invalid columns:" + ", ".join(bad_fields)
    else:
        err = ""
    msg = """DataFrame.dtypes for data must be int, float, bool or category.  When
 categorical type is supplied, DMatrix parameter `enable_categorical` must
 be set to `True`.""" + err
    raise ValueError(msg)
 def _transform_pandas_df(
-    data,
+    data: DataFrame,
    enable_categorical: bool,
    feature_names: Optional[List[str]] = None,
    feature_types: Optional[List[str]] = None,
-    meta=None,
+    meta: Optional[str] = None,
-    meta_type=None,
+    meta_type: Optional[str] = None,
-):
+) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
    import pandas as pd
    from pandas.api.types import is_sparse, is_categorical_dtype
-    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
+    if not all(
-               (is_categorical_dtype(dtype) and enable_categorical)
+        dtype.name in _pandas_dtype_mapper
-               for dtype in data.dtypes):
+        or is_sparse(dtype)
-        bad_fields = [
+        or (is_categorical_dtype(dtype) and enable_categorical)
-            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
+        for dtype in data.dtypes
-            if dtype.name not in _pandas_dtype_mapper
+    ):
-        ]
+        _invalid_dataframe_dtype(data)
        msg = """DataFrame.dtypes for data must be int, float, bool or category.  When
                categorical type is supplied, DMatrix parameter `enable_categorical` must
                be set to `True`."""
        raise ValueError(msg + ', '.join(bad_fields))
    # handle feature names
    if feature_names is None and meta is None:
        if isinstance(data.columns, pd.MultiIndex):
-            feature_names = [
+            feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
                ' '.join([str(x) for x in i]) for i in data.columns
            ]
        elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
            feature_names = list(map(str, data.columns))
        else:
@ -263,21 +275,24 @@ def _transform_pandas_df(
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])
-    # handle categorical codes.
+    # handle category codes.
    transformed = pd.DataFrame()
    if enable_categorical:
        for i, dtype in enumerate(data.dtypes):
            if is_categorical_dtype(dtype):
-                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+                # pandas uses -1 as default missing value for categorical data
                transformed[data.columns[i]] = (
                    data[data.columns[i]]
                    .cat.codes.astype(np.float32)
                    .replace(-1.0, np.NaN)
                )
            else:
                transformed[data.columns[i]] = data[data.columns[i]]
    else:
        transformed = data
    if meta and len(data.columns) > 1:
-        raise ValueError(
+        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
            f"DataFrame for {meta} cannot have multiple columns"
        )
    dtype = meta_type if meta_type else np.float32
    arr = transformed.values
@ -287,7 +302,7 @@ def _transform_pandas_df(
 def _from_pandas_df(
-    data,
+    data: DataFrame,
    enable_categorical: bool,
    missing,
    nthread,
@ -300,6 +315,7 @@ def _from_pandas_df(
                             feature_types)
 def _is_pandas_series(data):
    try:
        import pandas as pd
@ -318,13 +334,26 @@ def _is_modin_series(data):
 def _from_pandas_series(
    data,
-    missing,
+    missing: float,
-    nthread,
+    nthread: int,
    enable_categorical: bool,
    feature_names: Optional[List[str]],
    feature_types: Optional[List[str]],
 ):
    from pandas.api.types import is_categorical_dtype
    if (data.dtype.name not in _pandas_dtype_mapper) and not (
        is_categorical_dtype(data.dtype) and enable_categorical
    ):
        _invalid_dataframe_dtype(data)
    if enable_categorical and is_categorical_dtype(data.dtype):
        data = data.cat.codes
    return _from_numpy_array(
-        data.values.astype("float"), missing, nthread, feature_names, feature_types
+        data.values.reshape(data.shape[0], 1).astype("float"),
        missing,
        nthread,
        feature_names,
        feature_types,
    )
@ -428,7 +457,7 @@ def _is_cudf_df(data):
    return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
-def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
+def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
    """Extract CuDF __cuda_array_interface__.  This is special as it returns a new list of
    data and a list of array interfaces.  The data is list of categorical codes that
    caller can safely ignore, but have to keep their reference alive until usage of array
@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
    except ImportError:
        from cudf.utils.dtypes import is_categorical_dtype
    cat_codes = []
    interfaces = []
    if _is_cudf_ser(data):
-        interfaces.append(data.__cuda_array_interface__)
+        if is_categorical_dtype(data.dtype):
            interface = cat_codes[0].__cuda_array_interface__
        else:
            interface = data.__cuda_array_interface__
        if "mask" in interface:
            interface["mask"] = interface["mask"].__cuda_array_interface__
        interfaces.append(interface)
    else:
-        for col in data:
+        for i, col in enumerate(data):
            if is_categorical_dtype(data[col].dtype):
-                codes = data[col].cat.codes
+                codes = cat_codes[i]
                interface = codes.__cuda_array_interface__
                cat_codes.append(codes)
            else:
                interface = data[col].__cuda_array_interface__
            if "mask" in interface:
                interface["mask"] = interface["mask"].__cuda_array_interface__
            interfaces.append(interface)
    interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
-    return cat_codes, interfaces_str
+    return interfaces_str
 def _transform_cudf_df(
@ -470,25 +503,57 @@ def _transform_cudf_df(
    except ImportError:
        from cudf.utils.dtypes import is_categorical_dtype
    if _is_cudf_ser(data):
        dtypes = [data.dtype]
    else:
        dtypes = data.dtypes
    if not all(
        dtype.name in _pandas_dtype_mapper
        or (is_categorical_dtype(dtype) and enable_categorical)
        for dtype in dtypes
    ):
        _invalid_dataframe_dtype(data)
    # handle feature names
    if feature_names is None:
        if _is_cudf_ser(data):
            feature_names = [data.name]
        elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
            feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
        elif (
            lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
            or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
            # Unique to cuDF, no equivalence in pandas 1.3.3
            or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
        ):
            feature_names = list(map(str, data.columns))
        else:
            feature_names = data.columns.format()
    # handle feature types
    if feature_types is None:
        feature_types = []
        if _is_cudf_ser(data):
            dtypes = [data.dtype]
        else:
            dtypes = data.dtypes
        for dtype in dtypes:
            if is_categorical_dtype(dtype) and enable_categorical:
                feature_types.append(CAT_T)
            else:
                feature_types.append(_pandas_dtype_mapper[dtype.name])
-    return data, feature_names, feature_types
+
    # handle categorical data
    cat_codes = []
    if _is_cudf_ser(data):
        # unlike pandas, cuDF uses NA for missing data.
        if is_categorical_dtype(data.dtype) and enable_categorical:
            codes = data.cat.codes
            cat_codes.append(codes)
    else:
        for col in data:
            if is_categorical_dtype(data[col].dtype) and enable_categorical:
                codes = data[col].cat.codes
                cat_codes.append(codes)
    return data, cat_codes, feature_names, feature_types
 def _from_cudf_df(
@ -499,10 +564,10 @@ def _from_cudf_df(
    feature_types: Optional[List[str]],
    enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, Any, Any]:
-    data, feature_names, feature_types = _transform_cudf_df(
+    data, cat_codes, feature_names, feature_types = _transform_cudf_df(
        data, feature_names, feature_types, enable_categorical
    )
-    _, interfaces_str = _cudf_array_interfaces(data)
+    interfaces_str = _cudf_array_interfaces(data, cat_codes)
    handle = ctypes.c_void_p()
    config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
    _check_call(
@ -707,8 +772,9 @@ def dispatch_data_backend(
        return _from_pandas_df(data, enable_categorical, missing, threads,
                               feature_names, feature_types)
    if _is_pandas_series(data):
-        return _from_pandas_series(data, missing, threads, feature_names,
+        return _from_pandas_series(
-                                   feature_types)
+            data, missing, threads, enable_categorical, feature_names, feature_types
        )
    if _is_cudf_df(data) or _is_cudf_ser(data):
        return _from_cudf_df(
            data, missing, threads, feature_names, feature_types, enable_categorical
@ -732,8 +798,9 @@ def dispatch_data_backend(
        return _from_pandas_df(data, enable_categorical, missing, threads,
                               feature_names, feature_types)
    if _is_modin_series(data):
-        return _from_pandas_series(data, missing, threads, feature_names,
+        return _from_pandas_series(
-                                   feature_types)
+            data, missing, threads, enable_categorical, feature_names, feature_types
        )
    if _has_array_protocol(data):
        array = np.asarray(data)
        return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
        _meta_from_dt(data, name, dtype, handle)
        return
    if _is_modin_df(data):
-        data, _, _ = _transform_pandas_df(
+        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
            data, False, meta=name, meta_type=dtype)
        _meta_from_numpy(data, name, dtype, handle)
        return
    if _is_modin_series(data):
@ -917,30 +983,38 @@ def _proxy_transform(
        )
    if _is_cupy_array(data):
        data = _transform_cupy_array(data)
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
    if _is_dlpack(data):
-        return _transform_dlpack(data), feature_names, feature_types
+        return _transform_dlpack(data), None, feature_names, feature_types
    if _is_numpy_array(data):
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
    if _is_scipy_csr(data):
-        return data, feature_names, feature_types
+        return data, None, feature_names, feature_types
    if _is_pandas_df(data):
        arr, feature_names, feature_types = _transform_pandas_df(
            data, enable_categorical, feature_names, feature_types
        )
-        return arr, feature_names, feature_types
+        return arr, None, feature_names, feature_types
    raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
-def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
+def dispatch_proxy_set_data(
    proxy: _ProxyDMatrix,
    data: Any,
    cat_codes: Optional[list],
    allow_host: bool,
 ) -> None:
    """Dispatch for DeviceQuantileDMatrix."""
    if not _is_cudf_ser(data) and not _is_pandas_series(data):
        _check_data_shape(data)
    if _is_cudf_df(data):
-        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
+        # pylint: disable=W0212
        proxy._set_data_from_cuda_columnar(data, cat_codes)
        return
    if _is_cudf_ser(data):
-        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
+        # pylint: disable=W0212
        proxy._set_data_from_cuda_columnar(data, cat_codes)
        return
    if _is_cupy_array(data):
        proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212
--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@ -1,5 +1,5 @@
 /*!
- * Copyright 2020 by XGBoost Contributors
+ * Copyright 2020-2021 by XGBoost Contributors
 * \file categorical.h
 */
 #ifndef XGBOOST_COMMON_CATEGORICAL_H_
@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
  return !s_cats.Check(cat);
 }
 // Validate a categorical value: the CHECK_GE below fails, reporting the
 // attached message, when `cat` is negative.  Non-negative values pass
 // through with no other effect.
 inline void CheckCat(bst_cat_t cat) {
  CHECK_GE(cat, 0) << "Invalid categorical value detected.  Categorical value "
                      "should be non-negative.";
 }
 struct IsCatOp {
  XGBOOST_DEVICE bool operator()(FeatureType ft) {
    return ft == FeatureType::kCategorical;
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
    dh::device_vector<Entry> *p_sorted_entries,
    dh::caching_device_vector<size_t> *p_column_sizes_scan) {
  info.feature_types.SetDevice(device);
  auto d_feature_types = info.feature_types.ConstDeviceSpan();
  CHECK(!d_feature_types.empty());
  auto &column_sizes_scan = *p_column_sizes_scan;
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
 void SortByWeight(dh::device_vector<float>* weights,
                  dh::device_vector<Entry>* sorted_entries);
 void RemoveDuplicatedCategories(
    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
    dh::device_vector<Entry> *p_sorted_entries,
    dh::caching_device_vector<size_t> *p_column_sizes_scan);
 }  // namespace detail
 // Compute sketch on DMatrix.
@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
                           size_t sketch_batch_num_elements = 0);
 template <typename AdapterBatch>
-void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
+void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
-                          size_t begin, size_t end, float missing,
+                          int device, size_t columns, size_t begin, size_t end,
-                          SketchContainer* sketch_container, int num_cuts) {
+                          float missing, SketchContainer *sketch_container,
                          int num_cuts) {
  // Copy current subset of valid elements into temporary storage and sort
  dh::device_vector<Entry> sorted_entries;
  dh::caching_device_vector<size_t> column_sizes_scan;
@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
      thrust::make_counting_iterator(0llu),
      [=] __device__(size_t idx) { return batch.GetElement(idx); });
  HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
  cuts_ptr.SetDevice(device);
  detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
                                 columns, num_cuts, device,
                                 &cuts_ptr,
@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
  thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
               sorted_entries.end(), detail::EntryCompareOp());
-  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
+  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
                                       &sorted_entries, &column_sizes_scan);
  }
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
  auto const &h_cuts_ptr = cuts_ptr.HostVector();
  // Extract the cuts from all columns concurrently
  sketch_container->Push(dh::ToSpan(sorted_entries),
                         dh::ToSpan(column_sizes_scan), d_cuts_ptr,
@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
  detail::SortByWeight(&temp_weights, &sorted_entries);
  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
                                       &sorted_entries, &column_sizes_scan);
  }
  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
        device, num_cuts_per_feature, false);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
      size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
-      ProcessSlidingWindow(batch, device, num_cols,
+      ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
-                           begin, end, missing, sketch_container, num_cuts_per_feature);
+                           sketch_container, num_cuts_per_feature);
    }
  }
 }
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@ -21,6 +21,7 @@
 #include "array_interface.h"
 #include "../c_api/c_api_error.h"
 #include "../common/math.h"
 namespace xgboost {
 namespace data {
@ -80,6 +81,24 @@ struct COOTuple {
  float value{0};
 };
 // Predicate telling whether a value counts as present: it must be neither NaN
 // (as reported by common::CheckNAN) nor equal to the configured `missing`
 // sentinel.  Overloads accept a raw float, a COOTuple (judged by `value`) and
 // an Entry (judged by `fvalue`).
 struct IsValidFunctor {
  // Sentinel that marks an absent value.
  float missing;

  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}

  XGBOOST_DEVICE bool operator()(float value) const {
    // NaN is tested first, matching the short-circuit order of the `||` form.
    return !common::CheckNAN(value) && value != missing;
  }

  XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
    return !common::CheckNAN(e.value) && e.value != missing;
  }

  XGBOOST_DEVICE bool operator()(const Entry& e) const {
    return !common::CheckNAN(e.fvalue) && e.fvalue != missing;
  }
 };
 namespace detail {
 /**
--- a/src/data/data.cc
+++ b/src/data/data.cc
@ -987,18 +987,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
  // Second pass over batch, placing elements in correct position
  auto is_valid = data::IsValidFunctor{missing};
 #pragma omp parallel num_threads(nthread)
  {
    exec.Run([&]() {
      int tid = omp_get_thread_num();
-      size_t begin = tid*thread_size;
+      size_t begin = tid * thread_size;
-      size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
+      size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
      for (size_t i = begin; i < end; ++i) {
        auto line = batch.GetLine(i);
        for (auto j = 0ull; j < line.Size(); j++) {
          auto element = line.GetElement(j);
          const size_t key = (element.row_idx - base_rowid);
-          if (!common::CheckNAN(element.value) && element.value != missing) {
+          if (is_valid(element)) {
            builder.Push(key, Entry(element.column_idx, element.value), tid);
          }
        }
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@ -15,29 +15,6 @@
 namespace xgboost {
 namespace data {
 struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
  float missing;
  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
  __device__ bool operator()(float value) const {
    return !(common::CheckNAN(value) || value == missing);
  }
  __device__ bool operator()(const data::COOTuple& e) const {
    if (common::CheckNAN(e.value) || e.value == missing) {
      return false;
    }
    return true;
  }
  __device__ bool operator()(const Entry& e) const {
    if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
      return false;
    }
    return true;
  }
 };
 class CudfAdapterBatch : public detail::NoMetaInfo {
  friend class CudfAdapter;
--- a/src/data/iterative_device_dmatrix.cu
+++ b/src/data/iterative_device_dmatrix.cu
@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
  if (batches == 1) {
    this->info_ = std::move(proxy->Info());
    this->info_.num_nonzero_ = nnz;
    CHECK_EQ(proxy->Info().labels_.Size(), 0);
  }
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@ -585,6 +585,7 @@ struct GPUHistMakerDevice {
      CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
          << "Categorical feature value too large.";
      auto cat = common::AsCat(candidate.split.fvalue);
      common::CheckCat(cat);
      std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
      LBitField32 cats_bits(split_cats);
      cats_bits.Set(cat);
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@ -392,6 +392,52 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
  EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
 }
 // Sketch a single categorical column through AdapterDeviceSketch and verify
 // that the resulting cuts are exactly the sorted unique input values.
 //
 // n              - number of values generated for the column.
 // num_categories - passed to the generator; the test later asserts that the
 //                  number of unique values equals it.
 // num_bins       - bin count handed to the sketch container and the sketch.
 // weighted       - when true, per-row weights are filled in before sketching.
 void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
                                  int32_t num_bins, bool weighted) {
  // Host copy of the column, mirrored into a device vector for the adapter.
  auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
  thrust::device_vector<float> x(h_x);
  auto adapter = AdapterFromData(x, n, 1);
  // Meta info describing an n x 1 matrix whose only feature is categorical.
  MetaInfo info;
  info.num_row_ = n;
  info.num_col_ = 1;
  info.feature_types.HostVector().push_back(FeatureType::kCategorical);
  if (weighted) {
    // One weight per row, drawn from `dist` (constructed with bounds 0 and 1).
    std::vector<float> weights(n, 0);
    SimpleLCG lcg;
    SimpleRealUniformDistribution<float> dist(0, 1);
    for (auto& v : weights) {
      v = dist(&lcg);
    }
    info.weights_.HostVector() = weights;
  }
  ASSERT_EQ(info.feature_types.Size(), 1);
  // NOTE(review): trailing arguments are presumably (columns=1, rows=n,
  // device=0) -- confirm against the SketchContainer constructor.
  SketchContainer container(info.feature_types, num_bins, 1, n, 0);
  // quiet_NaN is presumably the missing-value argument -- confirm against the
  // AdapterDeviceSketch signature, which is not fully visible here.
  AdapterDeviceSketch(adapter.Value(), num_bins, info,
                      std::numeric_limits<float>::quiet_NaN(), &container);
  HistogramCuts cuts;
  container.MakeCuts(&cuts);
  // Compute the reference answer: sort the input on device and deduplicate.
  thrust::sort(x.begin(), x.end());
  auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
  // The input must contain duplicates, otherwise the test would not exercise
  // duplicate removal.
  ASSERT_NE(n_uniques, x.size());
  // One bin per unique category, and every category must be present.
  ASSERT_EQ(cuts.TotalBins(), n_uniques);
  ASSERT_EQ(n_uniques, num_categories);
  // Cut values must be sorted and free of duplicates.
  auto& values = cuts.cut_values_.HostVector();
  ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
  auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
  ASSERT_TRUE(is_unique);
  // Keep only the unique prefix, copy it back to host, and compare it with the
  // cut values element by element.
  x.resize(n_uniques);
  h_x.resize(n_uniques);
  thrust::copy(x.begin(), x.end(), h_x.begin());
  for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
    ASSERT_EQ(h_x[i], values[i]);
  }
 }
 TEST(HistUtil, AdapterDeviceSketchCategorical) {
  int categorical_sizes[] = {2, 6, 8, 12};
  int num_bins = 256;
@ -404,6 +450,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
      auto adapter = AdapterFromData(x_device, n, 1);
      ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
                          adapter.NumRows(), dmat.get());
      TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
      TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
    }
  }
 }
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@ -186,6 +186,37 @@ Arrow specification.'''
        assert len(Xy.feature_types) == X.shape[1]
        assert all(t == "c" for t in Xy.feature_types)
        # test missing value
        X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
        X["f0"] = X["f0"].astype("category")
        df, cat_codes, _, _ = xgb.data._transform_cudf_df(
            X, None, None, enable_categorical=True
        )
        for col in cat_codes:
            assert col.has_nulls
        y = [0, 1, 2]
        with pytest.raises(ValueError):
            xgb.DMatrix(X, y)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1
        with pytest.raises(ValueError):
            xgb.DeviceQuantileDMatrix(X, y)
        Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1
        X = X["f0"]
        with pytest.raises(ValueError):
            xgb.DMatrix(X, y)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_cupy())
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@ -138,9 +138,22 @@ class TestPandas:
            X, enable_categorical=True
        )
        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
        assert transformed[:, 0].min() == 0
        # test missing value
        X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
        X["f0"] = X["f0"].astype("category")
        arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
        assert not np.any(arr == -1.0)
        X = X["f0"]
        with pytest.raises(ValueError):
            xgb.DMatrix(X, y)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        assert Xy.num_row() == 3
        assert Xy.num_col() == 1
    def test_pandas_sparse(self):
        import pandas as pd
        rows = 100