[backport] Handle missing values in dataframe with category dtype. (#7331) (#7413)

* Handle missing values in dataframe with category dtype. (#7331)

* Replace -1 in pandas initializer.
* Unify `IsValid` functor.
* Mimic pandas data handling in cuDF glue code.
* Check invalid categories.
* Fix DDM sketching.

* Fix pick error.
Author: Jiaming Yuan, 2021-11-10 21:24:46 +08:00 (committed by GitHub)
Parent: 11f8b5cfcd
Commit: 14c56f05da
13 changed files with 308 additions and 110 deletions

View File

@@ -386,7 +386,7 @@ class DataIter: # pylint: disable=too-many-instance-attributes
raise exc # pylint: disable=raising-bad-type
def __del__(self) -> None:
assert self._temporary_data is None, self._temporary_data
assert self._temporary_data is None
assert self._exception is None
def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument
@@ -410,19 +410,19 @@ class DataIter: # pylint: disable=too-many-instance-attributes
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
**kwargs: Any,
):
) -> None:
from .data import dispatch_proxy_set_data
from .data import _proxy_transform
transformed, feature_names, feature_types = _proxy_transform(
new, cat_codes, feature_names, feature_types = _proxy_transform(
data,
feature_names,
feature_types,
self._enable_categorical,
)
# Stage the data; meta info is copied inside the C++ MetaInfo.
self._temporary_data = transformed
dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
self._temporary_data = (new, cat_codes)
dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
self.proxy.set_info(
feature_names=feature_names,
feature_types=feature_types,
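For context, `_temporary_data` now stages the `(new, cat_codes)` tuple so that both the transformed frame and its categorical code columns stay alive until the C API call has consumed them. Below is a minimal sketch of a custom iterator that exercises this path; the `DFIter` class and its `frames` list are illustrative, only the `DataIter` hooks are real API.

import xgboost as xgb

class DFIter(xgb.core.DataIter):
    """Illustrative iterator feeding per-batch frames to DeviceQuantileDMatrix."""

    def __init__(self, frames):
        self._frames = frames  # e.g. a list of (X, y) cuDF pairs
        self._it = 0
        super().__init__()

    def next(self, input_data):
        if self._it == len(self._frames):
            return 0  # signal the end of iteration
        X, y = self._frames[self._it]
        # input_data triggers _proxy_transform and the staging shown above.
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self):
        self._it = 0

# m = xgb.DeviceQuantileDMatrix(DFIter(frames), enable_categorical=True)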
@@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
def _set_data_from_cuda_interface(self, data):
def _set_data_from_cuda_interface(self, data) -> None:
"""Set data from CUDA array interface."""
interface = data.__cuda_array_interface__
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
)
def _set_data_from_cuda_columnar(self, data):
def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
"""Set data from CUDA columnar format."""
from .data import _cudf_array_interfaces
_, interfaces_str = _cudf_array_interfaces(data)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
def _set_data_from_array(self, data: np.ndarray):
@@ -1986,13 +1986,6 @@ class Booster(object):
preds = ctypes.POINTER(ctypes.c_float)()
# once caching is supported, we can pass id(data) as cache id.
try:
import pandas as pd
if isinstance(data, pd.DataFrame):
data = data.values
except ImportError:
pass
args = {
"type": 0,
"training": False,
@@ -2027,7 +2020,20 @@ class Booster(object):
f"got {data.shape[1]}"
)
from .data import _is_pandas_df, _transform_pandas_df
from .data import _array_interface
if (
_is_pandas_df(data)
or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
):
ft = self.feature_types
if ft is None:
enable_categorical = False
else:
enable_categorical = any(f == "c" for f in ft)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, enable_categorical)
if isinstance(data, np.ndarray):
from .data import _ensure_np_dtype
data, _ = _ensure_np_dtype(data, data.dtype)
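Net effect of this hunk: `inplace_predict` now re-encodes categorical frames by itself whenever the booster's feature types contain "c", rather than flattening the frame to raw values. A hedged end-to-end sketch, assuming a CUDA device since categorical training targets `gpu_hist` in this release:

import pandas as pd
import xgboost as xgb

X = pd.DataFrame({"f0": pd.Series(["a", "b", None], dtype="category")})
y = [0.0, 1.0, 0.5]
Xy = xgb.DMatrix(X, y, enable_categorical=True)
booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=4)
# The categorical frame is transformed internally before prediction.
predt = booster.inplace_predict(X)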
@@ -2080,9 +2086,11 @@ class Booster(object):
)
return _prediction_output(shape, dims, preds, True)
if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
from .data import _cudf_array_interfaces
_, interfaces_str = _cudf_array_interfaces(data)
from .data import _cudf_array_interfaces, _transform_cudf_df
data, cat_codes, _, _ = _transform_cudf_df(
data, None, None, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,

View File

@@ -1,4 +1,4 @@
# pylint: disable=too-many-arguments, too-many-branches
# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
# pylint: disable=too-many-return-statements, import-error
'''Data dispatching for DMatrix.'''
import ctypes
@@ -12,7 +12,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix
from .compat import lazy_isinstance
from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@@ -217,36 +217,48 @@ _pandas_dtype_mapper = {
}
def _invalid_dataframe_dtype(data) -> None:
# pandas Series has `dtypes` but it is just a single object
# cuDF Series doesn't have `dtypes`.
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
bad_fields = [
str(data.columns[i])
for i, dtype in enumerate(data.dtypes)
if dtype.name not in _pandas_dtype_mapper
]
err = " Invalid columns:" + ", ".join(bad_fields)
else:
err = ""
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`.""" + err
raise ValueError(msg)
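A short sketch of both paths through this helper, using a hypothetical column name `f0`:

import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"f0": pd.Series(["a", "b"], dtype="category")})
# Without opting in, construction raises ValueError and names "f0" as invalid:
#     xgb.DMatrix(df, label=[0, 1])
# With the flag, the same frame is accepted:
m = xgb.DMatrix(df, label=[0, 1], enable_categorical=True)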
def _transform_pandas_df(
data,
data: DataFrame,
enable_categorical: bool,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
meta=None,
meta_type=None,
):
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
(is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
if dtype.name not in _pandas_dtype_mapper
]
msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))
if not all(
dtype.name in _pandas_dtype_mapper
or is_sparse(dtype)
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in data.dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None and meta is None:
if isinstance(data.columns, pd.MultiIndex):
feature_names = [
' '.join([str(x) for x in i]) for i in data.columns
]
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
feature_names = list(map(str, data.columns))
else:
@@ -263,21 +275,24 @@ def _transform_pandas_df(
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
# handle categorical codes.
# handle category codes.
transformed = pd.DataFrame()
if enable_categorical:
for i, dtype in enumerate(data.dtypes):
if is_categorical_dtype(dtype):
transformed[data.columns[i]] = data[data.columns[i]].cat.codes
# pandas uses -1 as the default missing value for categorical data
transformed[data.columns[i]] = (
data[data.columns[i]]
.cat.codes.astype(np.float32)
.replace(-1.0, np.NaN)
)
else:
transformed[data.columns[i]] = data[data.columns[i]]
else:
transformed = data
if meta and len(data.columns) > 1:
raise ValueError(
f"DataFrame for {meta} cannot have multiple columns"
)
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
dtype = meta_type if meta_type else np.float32
arr = transformed.values
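A minimal sketch of the -1 to NaN replacement performed above, assuming only pandas and NumPy:

import numpy as np
import pandas as pd

s = pd.Series(["a", "b", None], dtype="category")
print(s.cat.codes.to_list())  # [0, 1, -1]; pandas marks the missing entry with -1
codes = s.cat.codes.astype(np.float32).replace(-1.0, np.NaN)
print(codes.to_list())        # [0.0, 1.0, nan]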
@@ -287,7 +302,7 @@ def _transform_pandas_df(
def _from_pandas_df(
data,
data: DataFrame,
enable_categorical: bool,
missing,
nthread,
@@ -300,6 +315,7 @@ def _from_pandas_df(
feature_types)
def _is_pandas_series(data):
try:
import pandas as pd
@@ -318,13 +334,26 @@ def _is_modin_series(data):
def _from_pandas_series(
data,
missing,
nthread,
missing: float,
nthread: int,
enable_categorical: bool,
feature_names: Optional[List[str]],
feature_types: Optional[List[str]],
):
from pandas.api.types import is_categorical_dtype
if (data.dtype.name not in _pandas_dtype_mapper) and not (
is_categorical_dtype(data.dtype) and enable_categorical
):
_invalid_dataframe_dtype(data)
if enable_categorical and is_categorical_dtype(data.dtype):
data = data.cat.codes
return _from_numpy_array(
data.values.astype("float"), missing, nthread, feature_names, feature_types
data.values.reshape(data.shape[0], 1).astype("float"),
missing,
nthread,
feature_names,
feature_types,
)
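With `enable_categorical` now threaded through the Series path as well, a single categorical column round-trips. A short sketch:

import pandas as pd
import xgboost as xgb

s = pd.Series(["a", "b", "a"], dtype="category")
m = xgb.DMatrix(s, label=[0, 1, 0], enable_categorical=True)
assert m.num_row() == 3 and m.num_col() == 1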
@@ -428,7 +457,7 @@ def _is_cudf_df(data):
return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)
def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
data and a list of array interfaces. The data is list of categorical codes that
caller can safely ignore, but have to keep their reference alive until usage of array
@@ -440,23 +469,27 @@ def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
except ImportError:
from cudf.utils.dtypes import is_categorical_dtype
cat_codes = []
interfaces = []
if _is_cudf_ser(data):
interfaces.append(data.__cuda_array_interface__)
if is_categorical_dtype(data.dtype):
interface = cat_codes[0].__cuda_array_interface__
else:
interface = data.__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
else:
for col in data:
for i, col in enumerate(data):
if is_categorical_dtype(data[col].dtype):
codes = data[col].cat.codes
codes = cat_codes[i]
interface = codes.__cuda_array_interface__
cat_codes.append(codes)
else:
interface = data[col].__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
return cat_codes, interfaces_str
return interfaces_str
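Because the serialized JSON carries only device pointers, callers must keep `cat_codes` referenced until the interfaces have been consumed. A sketch of the calling pattern using the private helpers from this patch (internal API, subject to change):

import numpy as np
import cudf
import xgboost as xgb

X = cudf.DataFrame({"f0": ["a", np.NaN, "b"]})
X["f0"] = X["f0"].astype("category")
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
    X, None, None, enable_categorical=True
)
interfaces_str = xgb.data._cudf_array_interfaces(df, cat_codes)
# cat_codes must stay alive for as long as interfaces_str is in use.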
def _transform_cudf_df(
@@ -470,25 +503,57 @@ def _transform_cudf_df(
except ImportError:
from cudf.utils.dtypes import is_categorical_dtype
if _is_cudf_ser(data):
dtypes = [data.dtype]
else:
dtypes = data.dtypes
if not all(
dtype.name in _pandas_dtype_mapper
or (is_categorical_dtype(dtype) and enable_categorical)
for dtype in dtypes
):
_invalid_dataframe_dtype(data)
# handle feature names
if feature_names is None:
if _is_cudf_ser(data):
feature_names = [data.name]
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
elif (
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
# Unique to cuDF, no equivalent in pandas 1.3.3
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
):
feature_names = list(map(str, data.columns))
else:
feature_names = data.columns.format()
# handle feature types
if feature_types is None:
feature_types = []
if _is_cudf_ser(data):
dtypes = [data.dtype]
else:
dtypes = data.dtypes
for dtype in dtypes:
if is_categorical_dtype(dtype) and enable_categorical:
feature_types.append(CAT_T)
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])
return data, feature_names, feature_types
# handle categorical data
cat_codes = []
if _is_cudf_ser(data):
# unlike pandas, cuDF uses NA for missing data.
if is_categorical_dtype(data.dtype) and enable_categorical:
codes = data.cat.codes
cat_codes.append(codes)
else:
for col in data:
if is_categorical_dtype(data[col].dtype) and enable_categorical:
codes = data[col].cat.codes
cat_codes.append(codes)
return data, cat_codes, feature_names, feature_types
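The comment above is the crux: cuDF's `.cat.codes` keeps nulls as `<NA>` instead of pandas' -1 sentinel, so no replacement pass is needed on this path. A tiny sketch, assuming cuDF is installed:

import cudf

s = cudf.Series(["a", None]).astype("category")
print(s.cat.codes)  # 0 and <NA>: the null survives, there is no -1 sentinel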
def _from_cudf_df(
@@ -499,10 +564,10 @@ def _from_cudf_df(
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
data, feature_names, feature_types = _transform_cudf_df(
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
data, feature_names, feature_types, enable_categorical
)
_, interfaces_str = _cudf_array_interfaces(data)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
handle = ctypes.c_void_p()
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
_check_call(
@@ -707,8 +772,9 @@ def dispatch_data_backend(
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
feature_types)
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
)
if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df(
data, missing, threads, feature_names, feature_types, enable_categorical
@@ -732,8 +798,9 @@ def dispatch_data_backend(
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_modin_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
feature_types)
return _from_pandas_series(
data, missing, threads, enable_categorical, feature_names, feature_types
)
if _has_array_protocol(data):
array = np.asarray(data)
return _from_numpy_array(array, missing, threads, feature_names, feature_types)
@@ -866,8 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_dt(data, name, dtype, handle)
return
if _is_modin_df(data):
data, _, _ = _transform_pandas_df(
data, False, meta=name, meta_type=dtype)
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_modin_series(data):
@@ -917,30 +983,38 @@ def _proxy_transform(
)
if _is_cupy_array(data):
data = _transform_cupy_array(data)
return data, feature_names, feature_types
return data, None, feature_names, feature_types
if _is_dlpack(data):
return _transform_dlpack(data), feature_names, feature_types
return _transform_dlpack(data), None, feature_names, feature_types
if _is_numpy_array(data):
return data, feature_names, feature_types
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
return data, feature_names, feature_types
return data, None, feature_names, feature_types
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
return arr, feature_names, feature_types
return arr, None, feature_names, feature_types
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
def dispatch_proxy_set_data(
proxy: _ProxyDMatrix,
data: Any,
cat_codes: Optional[list],
allow_host: bool,
) -> None:
"""Dispatch for DeviceQuantileDMatrix."""
if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data)
if _is_cudf_df(data):
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
return
if _is_cudf_ser(data):
proxy._set_data_from_cuda_columnar(data) # pylint: disable=W0212
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
return
if _is_cupy_array(data):
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020 by XGBoost Contributors
* Copyright 2020-2021 by XGBoost Contributors
* \file categorical.h
*/
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -42,6 +42,11 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
return !s_cats.Check(cat);
}
inline void CheckCat(bst_cat_t cat) {
CHECK_GE(cat, 0) << "Invalid categorical value detected. Categorical value "
"should be non-negative.";
}
struct IsCatOp {
XGBOOST_DEVICE bool operator()(FeatureType ft) {
return ft == FeatureType::kCategorical;

View File

@@ -133,6 +133,7 @@ void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
info.feature_types.SetDevice(device);
auto d_feature_types = info.feature_types.ConstDeviceSpan();
CHECK(!d_feature_types.empty());
auto &column_sizes_scan = *p_column_sizes_scan;

View File

@@ -124,6 +124,11 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry> *p_sorted_entries,
dh::caching_device_vector<size_t> *p_column_sizes_scan);
} // namespace detail
// Compute sketch on DMatrix.
@@ -132,9 +137,10 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t sketch_batch_num_elements = 0);
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
size_t begin, size_t end, float missing,
SketchContainer* sketch_container, int num_cuts) {
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
int device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container,
int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort
dh::device_vector<Entry> sorted_entries;
dh::caching_device_vector<size_t> column_sizes_scan;
@@ -142,6 +148,7 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
cuts_ptr.SetDevice(device);
detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
columns, num_cuts, device,
&cuts_ptr,
@@ -151,8 +158,14 @@ void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
sorted_entries.end(), detail::EntryCompareOp());
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
}
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
auto const &h_cuts_ptr = cuts_ptr.HostVector();
// Extract the cuts from all columns concurrently
sketch_container->Push(dh::ToSpan(sorted_entries),
dh::ToSpan(column_sizes_scan), d_cuts_ptr,
@@ -222,6 +235,12 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
detail::SortByWeight(&temp_weights, &sorted_entries);
if (sketch_container->HasCategorical()) {
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
&sorted_entries, &column_sizes_scan);
}
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -274,8 +293,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
device, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
ProcessSlidingWindow(batch, device, num_cols,
begin, end, missing, sketch_container, num_cuts_per_feature);
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
sketch_container, num_cuts_per_feature);
}
}
}

View File

@@ -21,6 +21,7 @@
#include "array_interface.h"
#include "../c_api/c_api_error.h"
#include "../common/math.h"
namespace xgboost {
namespace data {
@@ -80,6 +81,24 @@ struct COOTuple {
float value{0};
};
struct IsValidFunctor {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
XGBOOST_DEVICE bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
return !(common::CheckNAN(e.value) || e.value == missing);
}
XGBOOST_DEVICE bool operator()(const Entry& e) const {
return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
}
};
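In Python terms, the unified predicate above treats NaN and the user-supplied `missing` value as invalid; a rough, illustrative equivalent:

import math

def is_valid(value: float, missing: float) -> bool:
    # Mirrors IsValidFunctor: NaN and the `missing` sentinel are both skipped.
    return not (math.isnan(value) or value == missing)

assert is_valid(1.0, missing=0.0)
assert not is_valid(float("nan"), missing=0.0)
assert not is_valid(0.0, missing=0.0)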
namespace detail {
/**

View File

@@ -987,18 +987,19 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
// Second pass over batch, placing elements in correct position
auto is_valid = data::IsValidFunctor{missing};
#pragma omp parallel num_threads(nthread)
{
exec.Run([&]() {
int tid = omp_get_thread_num();
size_t begin = tid*thread_size;
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
size_t begin = tid * thread_size;
size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
for (size_t i = begin; i < end; ++i) {
auto line = batch.GetLine(i);
for (auto j = 0ull; j < line.Size(); j++) {
auto element = line.GetElement(j);
const size_t key = (element.row_idx - base_rowid);
if (!common::CheckNAN(element.value) && element.value != missing) {
if (is_valid(element)) {
builder.Push(key, Entry(element.column_idx, element.value), tid);
}
}

View File

@@ -15,29 +15,6 @@
namespace xgboost {
namespace data {
struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
float missing;
XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
__device__ bool operator()(float value) const {
return !(common::CheckNAN(value) || value == missing);
}
__device__ bool operator()(const data::COOTuple& e) const {
if (common::CheckNAN(e.value) || e.value == missing) {
return false;
}
return true;
}
__device__ bool operator()(const Entry& e) const {
if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
return false;
}
return true;
}
};
class CudfAdapterBatch : public detail::NoMetaInfo {
friend class CudfAdapter;

View File

@@ -152,6 +152,7 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
if (batches == 1) {
this->info_ = std::move(proxy->Info());
this->info_.num_nonzero_ = nnz;
CHECK_EQ(proxy->Info().labels_.Size(), 0);
}

View File

@@ -585,6 +585,7 @@ struct GPUHistMakerDevice {
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
<< "Categorical feature value too large.";
auto cat = common::AsCat(candidate.split.fvalue);
common::CheckCat(cat);
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
LBitField32 cats_bits(split_cats);
cats_bits.Set(cat);

View File

@@ -392,6 +392,52 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
}
void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
int32_t num_bins, bool weighted) {
auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
thrust::device_vector<float> x(h_x);
auto adapter = AdapterFromData(x, n, 1);
MetaInfo info;
info.num_row_ = n;
info.num_col_ = 1;
info.feature_types.HostVector().push_back(FeatureType::kCategorical);
if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
info.weights_.HostVector() = weights;
}
ASSERT_EQ(info.feature_types.Size(), 1);
SketchContainer container(info.feature_types, num_bins, 1, n, 0);
AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), &container);
HistogramCuts cuts;
container.MakeCuts(&cuts);
thrust::sort(x.begin(), x.end());
auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);
auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);
x.resize(n_uniques);
h_x.resize(n_uniques);
thrust::copy(x.begin(), x.end(), h_x.begin());
for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
ASSERT_EQ(h_x[i], values[i]);
}
}
TEST(HistUtil, AdapterDeviceSketchCategorical) {
int categorical_sizes[] = {2, 6, 8, 12};
int num_bins = 256;
@@ -404,6 +450,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
auto adapter = AdapterFromData(x_device, n, 1);
ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
adapter.NumRows(), dmat.get());
TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
}
}
}

View File

@@ -186,6 +186,37 @@ Arrow specification.'''
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "c" for t in Xy.feature_types)
# test missing value
X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
X, None, None, enable_categorical=True
)
for col in cat_codes:
assert col.has_nulls
y = [0, 1, 2]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
with pytest.raises(ValueError):
xgb.DeviceQuantileDMatrix(X, y)
Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
X = X["f0"]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_cupy())

View File

@@ -138,9 +138,22 @@ class TestPandas:
X, enable_categorical=True
)
assert np.issubdtype(transformed[:, 0].dtype, np.integer)
assert transformed[:, 0].min() == 0
# test missing value
X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
assert not np.any(arr == -1.0)
X = X["f0"]
with pytest.raises(ValueError):
xgb.DMatrix(X, y)
Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.num_row() == 3
assert Xy.num_col() == 1
def test_pandas_sparse(self):
import pandas as pd
rows = 100