Bump version to 1.7.4. (#8805)

[backport] Fix CPU bin compression with categorical data. (#8809) (#8810)
* [backport] Fix CPU bin compression with categorical data. (#8809) * Fix CPU bin compression with categorical data. * The bug causes the maximum category to be less than 256 or the maximum number of bins when the input data is dense. * Avoid test symbol.
2023-02-16 06:40:01 +08:00 · 2023-02-16 06:39:25 +08:00 · 2023-02-15 02:45:09 +08:00 · 2023-02-15 01:39:20 +08:00 · 2023-02-09 20:16:39 +08:00 · 2023-02-09 18:31:49 +08:00
45 changed files with 1417 additions and 1019 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 1.7.3)
+project(xgboost LANGUAGES CXX C VERSION 1.7.4)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.7.3.1
-Date: 2023-01-06
+Version: 1.7.4.1
+Date: 2023-02-15
 Authors@R: c(
  person("Tianqi", "Chen", role = c("aut"),
         email = "tianqi.tchen@gmail.com"),
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -328,8 +328,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
                                reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
  object <- xgb.Booster.complete(object, saveraw = FALSE)
+
  if (!inherits(newdata, "xgb.DMatrix"))
-    newdata <- xgb.DMatrix(newdata, missing = missing)
+    newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
  if (!is.null(object[["feature_names"]]) &&
      !is.null(colnames(newdata)) &&
      !identical(object[["feature_names"]], colnames(newdata)))
--- a/R-package/configure
+++ b/R-package/configure
--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@@ -2,10 +2,25 @@

 AC_PREREQ(2.69)

-AC_INIT([xgboost],[1.7.3],[],[xgboost],[])
+AC_INIT([xgboost],[1.7.4],[],[xgboost],[])

-# Use this line to set CC variable to a C compiler
-AC_PROG_CC
+: ${R_HOME=`R RHOME`}
+if test -z "${R_HOME}"; then
+  echo "could not determine R_HOME"
+  exit 1
+fi
+
+CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
+CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
+CXX="${CXX14} ${CXX14STD}"
+CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
+
+CC=`"${R_HOME}/bin/R" CMD config CC`
+CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
+CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
+
+LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS`
+AC_LANG(C++)

 ### Check whether backtrace() is part of libc or the external lib libexecinfo
 AC_MSG_CHECKING([Backtrace lib])
@@ -40,7 +55,7 @@ then
  ac_pkg_openmp=no
  AC_MSG_CHECKING([whether OpenMP will work in a package])
  AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
-  ${CC} -o conftest conftest.c ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
+  ${CXX} -o conftest conftest.cpp ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
  AC_MSG_RESULT([${ac_pkg_openmp}])
  if test "${ac_pkg_openmp}" = no; then
    OPENMP_CXXFLAGS=''
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -23,7 +23,6 @@ PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread
 OBJECTS= \
    ./xgboost_R.o \
    ./xgboost_custom.o \
-    ./xgboost_assert.o \
    ./init.o \
    $(PKGROOT)/src/metric/metric.o \
    $(PKGROOT)/src/metric/elementwise_metric.o \
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -23,7 +23,6 @@ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)  -DDMLC_CMAKE_LITTLE_ENDIAN=1  $(SHLIB_PTHRE
 OBJECTS= \
    ./xgboost_R.o \
    ./xgboost_custom.o \
-    ./xgboost_assert.o \
    ./init.o \
    $(PKGROOT)/src/metric/metric.o \
    $(PKGROOT)/src/metric/elementwise_metric.o \
--- a/R-package/src/xgboost_assert.c
+++ b/R-package/src/xgboost_assert.c
@@ -1,26 +0,0 @@
-// Copyright (c) 2014 by Contributors
-#include <stdio.h>
-#include <stdarg.h>
-#include <Rinternals.h>
-
-// implements error handling
-void XGBoostAssert_R(int exp, const char *fmt, ...) {
-  char buf[1024];
-  if (exp == 0) {
-    va_list args;
-    va_start(args, fmt);
-    vsprintf(buf, fmt, args);
-    va_end(args);
-    error("AssertError:%s\n", buf);
-  }
-}
-void XGBoostCheck_R(int exp, const char *fmt, ...) {
-  char buf[1024];
-  if (exp == 0) {
-    va_list args;
-    va_start(args, fmt);
-    vsprintf(buf, fmt, args);
-    va_end(args);
-    error("%s\n", buf);
-  }
-}
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -6,6 +6,6 @@

 #define XGBOOST_VER_MAJOR 1
 #define XGBOOST_VER_MINOR 7
-#define XGBOOST_VER_PATCH 3
+#define XGBOOST_VER_PATCH 4

 #endif  // XGBOOST_VERSION_CONFIG_H_
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@

    <groupId>ml.dmlc</groupId>
    <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.7.3</version>
+    <version>1.7.4</version>
    <packaging>pom</packaging>
    <name>XGBoost JVM Package</name>
    <description>JVM Package for XGBoost</description>
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.3</version>
+        <version>1.7.4</version>
    </parent>
    <artifactId>xgboost4j-example_2.12</artifactId>
-    <version>1.7.3</version>
+    <version>1.7.4</version>
    <packaging>jar</packaging>
    <build>
        <plugins>
@@ -26,7 +26,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-            <version>1.7.3</version>
+            <version>1.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-            <version>1.7.3</version>
+            <version>1.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.3</version>
+        <version>1.7.4</version>
    </parent>
    <artifactId>xgboost4j-flink_2.12</artifactId>
-    <version>1.7.3</version>
+    <version>1.7.4</version>
    <build>
        <plugins>
            <plugin>
@@ -26,7 +26,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.7.3</version>
+            <version>1.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.3</version>
+        <version>1.7.4</version>
    </parent>
    <artifactId>xgboost4j-gpu_2.12</artifactId>
-    <version>1.7.3</version>
+    <version>1.7.4</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -6,7 +6,7 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.3</version>
+        <version>1.7.4</version>
    </parent>
    <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
    <build>
@@ -24,7 +24,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
-            <version>1.7.3</version>
+            <version>1.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.3</version>
+        <version>1.7.4</version>
    </parent>
    <artifactId>xgboost4j-spark_2.12</artifactId>
    <build>
@@ -24,7 +24,7 @@
        <dependency>
            <groupId>ml.dmlc</groupId>
            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>1.7.3</version>
+            <version>1.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,10 +6,10 @@
    <parent>
        <groupId>ml.dmlc</groupId>
        <artifactId>xgboost-jvm_2.12</artifactId>
-        <version>1.7.3</version>
+        <version>1.7.4</version>
    </parent>
    <artifactId>xgboost4j_2.12</artifactId>
-    <version>1.7.3</version>
+    <version>1.7.4</version>
    <packaging>jar</packaging>

    <dependencies>
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-1.7.3
+1.7.4
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -36,7 +36,6 @@ try:

    PANDAS_INSTALLED = True
 except ImportError:
-
    MultiIndex = object
    DataFrame = object
    Series = object
@@ -161,6 +160,7 @@ def concat(value: Sequence[_T]) -> _T:  # pylint: disable=too-many-return-statem
 # `importlib.utils`, except it's unclear from its document on how to use it.  This one
 # seems to be easy to understand and works out of box.

+
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2172,6 +2172,7 @@ class Booster:
        )
        return _prediction_output(shape, dims, preds, False)

+    # pylint: disable=too-many-statements
    def inplace_predict(
        self,
        data: DataType,
@@ -2192,10 +2193,10 @@ class Booster:

        .. code-block:: python

-            booster.set_param({'predictor': 'gpu_predictor'})
+            booster.set_param({"predictor": "gpu_predictor"})
            booster.inplace_predict(cupy_array)

-            booster.set_param({'predictor': 'cpu_predictor})
+            booster.set_param({"predictor": "cpu_predictor"})
            booster.inplace_predict(numpy_array)

        .. versionadded:: 1.1.0
@@ -2301,14 +2302,16 @@ class Booster:
            )
            return _prediction_output(shape, dims, preds, False)
        if isinstance(data, scipy.sparse.csr_matrix):
-            csr = data
+            from .data import _transform_scipy_csr
+
+            data = _transform_scipy_csr(data)
            _check_call(
                _LIB.XGBoosterPredictFromCSR(
                    self.handle,
-                    _array_interface(csr.indptr),
-                    _array_interface(csr.indices),
-                    _array_interface(csr.data),
-                    c_bst_ulong(csr.shape[1]),
+                    _array_interface(data.indptr),
+                    _array_interface(data.indices),
+                    _array_interface(data.data),
+                    c_bst_ulong(data.shape[1]),
                    from_pystr_to_cstr(json.dumps(args)),
                    p_handle,
                    ctypes.byref(shape),
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -30,6 +30,7 @@ from .core import (
    c_array,
    c_str,
    from_pystr_to_cstr,
+    make_jcargs,
 )

 DispatchedDataBackendReturnType = Tuple[
@@ -80,6 +81,21 @@ def _array_interface(data: np.ndarray) -> bytes:
    return interface_str


+def _transform_scipy_csr(data: DataType) -> DataType:
+    from scipy.sparse import csr_matrix
+
+    indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype)
+    indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype)
+    values, _ = _ensure_np_dtype(data.data, data.data.dtype)
+    if (
+        indptr is not data.indptr
+        or indices is not data.indices
+        or values is not data.data
+    ):
+        data = csr_matrix((values, indices, indptr), shape=data.shape)
+    return data
+
+
 def _from_scipy_csr(
    data: DataType,
    missing: FloatCompatible,
@@ -93,18 +109,14 @@ def _from_scipy_csr(
            f"length mismatch: {len(data.indices)} vs {len(data.data)}"
        )
    handle = ctypes.c_void_p()
-    args = {
-        "missing": float(missing),
-        "nthread": int(nthread),
-    }
-    config = bytes(json.dumps(args), "utf-8")
+    data = _transform_scipy_csr(data)
    _check_call(
        _LIB.XGDMatrixCreateFromCSR(
            _array_interface(data.indptr),
            _array_interface(data.indices),
            _array_interface(data.data),
            c_bst_ulong(data.shape[1]),
-            config,
+            make_jcargs(missing=float(missing), nthread=int(nthread)),
            ctypes.byref(handle),
        )
    )
@@ -153,12 +165,13 @@ def _is_numpy_array(data: DataType) -> bool:


 def _ensure_np_dtype(
-    data: DataType,
-    dtype: Optional[NumpyDType]
+    data: DataType, dtype: Optional[NumpyDType]
 ) -> Tuple[np.ndarray, Optional[NumpyDType]]:
    if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
-        data = data.astype(np.float32, copy=False)
        dtype = np.float32
+        data = data.astype(dtype, copy=False)
+    if not data.flags.aligned:
+        data = np.require(data, requirements="A")
    return data, dtype


@@ -1197,11 +1210,13 @@ def _proxy_transform(
        data, _ = _ensure_np_dtype(data, data.dtype)
        return data, None, feature_names, feature_types
    if _is_scipy_csr(data):
+        data = _transform_scipy_csr(data)
        return data, None, feature_names, feature_types
    if _is_pandas_df(data):
        arr, feature_names, feature_types = _transform_pandas_df(
            data, enable_categorical, feature_names, feature_types
        )
+        arr, _ = _ensure_np_dtype(arr, arr.dtype)
        return arr, None, feature_names, feature_types
    raise TypeError("Value type is not supported for data iterator:" + str(type(data)))

--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -140,6 +140,13 @@ _unsupported_predict_params = {
 }


+# TODO: supply hint message for all other unsupported params.
+_unsupported_params_hint_message = {
+    "enable_categorical": "`xgboost.spark` estimators do not have 'enable_categorical' param, "
+    "but you can set `feature_types` param and mark categorical features with 'c' string."
+}
+
+
 class _SparkXGBParams(
    HasFeaturesCol,
    HasLabelCol,
@@ -523,7 +530,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                    or k in _unsupported_predict_params
                    or k in _unsupported_train_params
                ):
-                    raise ValueError(f"Unsupported param '{k}'.")
+                    err_msg = _unsupported_params_hint_message.get(
+                        k, f"Unsupported param '{k}'."
+                    )
+                    raise ValueError(err_msg)
                _extra_params[k] = v
        _existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
        self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
@@ -749,6 +759,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            "feature_weights": self.getOrDefault(self.feature_weights),
            "missing": float(self.getOrDefault(self.missing)),
        }
+        if dmatrix_kwargs["feature_types"] is not None:
+            dmatrix_kwargs["enable_categorical"] = True
        booster_params["nthread"] = cpu_per_task
        use_gpu = self.getOrDefault(self.use_gpu)

--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
    feature_offsets_[fid] = accum_index;
  }

-  SetTypeSize(gmat.max_num_bins);
+  SetTypeSize(gmat.MaxNumBinPerFeat());
  auto storage_size =
      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
  index_.resize(storage_size, 0);
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2021 by Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
 * \file array_interface.h
 * \brief View of __array_interface__
 */
@@ -7,9 +7,11 @@
 #define XGBOOST_DATA_ARRAY_INTERFACE_H_

 #include <algorithm>
-#include <cinttypes>
+#include <cstddef>  // std::size_t
+#include <cstdint>
 #include <map>
 #include <string>
+#include <type_traits>  // std::alignment_of,std::remove_pointer_t
 #include <utility>
 #include <vector>

@@ -394,6 +396,11 @@ class ArrayInterface {

    data = ArrayInterfaceHandler::ExtractData(array, n);
    static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported.");
+
+    auto alignment = this->ElementAlignment();
+    auto ptr = reinterpret_cast<uintptr_t>(this->data);
+    CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment.";
+
    if (allow_mask) {
      common::Span<RBitField8::value_type> s_mask;
      size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask);
@@ -512,9 +519,15 @@ class ArrayInterface {
    return func(reinterpret_cast<uint64_t const *>(data));
  }

-  XGBOOST_DEVICE size_t ElementSize() {
-    return this->DispatchCall(
-        [](auto *p_values) { return sizeof(std::remove_pointer_t<decltype(p_values)>); });
+  XGBOOST_DEVICE std::size_t ElementSize() const {
+    return this->DispatchCall([](auto *typed_data_ptr) {
+      return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
+    });
+  }
+  XGBOOST_DEVICE std::size_t ElementAlignment() const {
+    return this->DispatchCall([](auto *typed_data_ptr) {
+      return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
+    });
  }

  template <typename T = float, typename... Index>
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM

 GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
                                   double sparse_thresh, bool sorted_sketch, int32_t n_threads,
-                                   common::Span<float> hess) {
+                                   common::Span<float> hess)
+    : max_numeric_bins_per_feat{max_bins_per_feat} {
  CHECK(p_fmat->SingleColBlock());
  // We use sorted sketching for approx tree method since it's more efficient in
  // computation time (but higher memory usage).
  cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);

-  max_num_bins = max_bins_per_feat;
  const uint32_t nbins = cut.Ptrs().back();
  hit_count.resize(nbins, 0);
  hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
    : row_ptr(info.num_row_ + 1, 0),
      hit_count(cuts.TotalBins(), 0),
      cut{std::forward<common::HistogramCuts>(cuts)},
-      max_num_bins(max_bin_per_feat),
+      max_numeric_bins_per_feat(max_bin_per_feat),
      isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}

 #if !defined(XGBOOST_USE_CUDA)
@@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
 }

 GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
-                                   common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
-                                   bool isDense, double sparse_thresh, int32_t n_threads) {
+                                   common::HistogramCuts cuts, int32_t max_bins_per_feat,
+                                   bool isDense, double sparse_thresh, int32_t n_threads)
+    : cut{std::move(cuts)},
+      max_numeric_bins_per_feat{max_bins_per_feat},
+      base_rowid{batch.base_rowid},
+      isDense_{isDense} {
  CHECK_GE(n_threads, 1);
-  base_rowid = batch.base_rowid;
-  isDense_ = isDense;
-  cut = cuts;
-  max_num_bins = max_bins_per_feat;
  CHECK_EQ(row_ptr.size(), 0);
  // The number of threads is pegged to the batch size. If the OMP
  // block is parallelized on anything other than the batch/block size,
@@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH

 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
-  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
+  if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
+      isDense) {
    // compress dense index to uint8
    index.SetBinTypeSize(common::kUint8BinsTypeSize);
    index.Resize((sizeof(uint8_t)) * n_index);
-  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
-              max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
+  } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
+              MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
             isDense) {
    // compress dense index to uint16
    index.SetBinTypeSize(common::kUint16BinsTypeSize);
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,

 GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
                                   EllpackPage const& in_page, BatchParam const& p)
-    : max_num_bins{p.max_bin} {
+    : max_numeric_bins_per_feat{p.max_bin} {
  auto page = in_page.Impl();
  isDense_ = page->is_dense;

--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -133,11 +133,15 @@ class GHistIndexMatrix {
  std::vector<size_t> hit_count;
  /*! \brief The corresponding cuts */
  common::HistogramCuts cut;
-  /*! \brief max_bin for each feature. */
-  bst_bin_t max_num_bins;
+  /** \brief max_bin for each feature. */
+  bst_bin_t max_numeric_bins_per_feat;
  /*! \brief base row index for current page (used by external memory) */
  size_t base_rowid{0};

+  bst_bin_t MaxNumBinPerFeat() const {
+    return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
+  }
+
  ~GHistIndexMatrix();
  /**
   * \brief Constrcutor for SimpleDMatrix.
@@ -160,7 +164,7 @@ class GHistIndexMatrix {
   * \brief Constructor for external memory.
   */
  GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
-                   common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
+                   common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
                   double sparse_thresh, int32_t n_threads);
  GHistIndexMatrix();  // also for ext mem, empty ctor so that we can read the cache back.

--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
    if (!fi->Read(&page->hit_count)) {
      return false;
    }
-    if (!fi->Read(&page->max_num_bins)) {
+    if (!fi->Read(&page->max_numeric_bins_per_feat)) {
      return false;
    }
    if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
        page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
        sizeof(uint64_t);
    // max_bins, base row, is_dense
-    fo->Write(page.max_num_bins);
-    bytes += sizeof(page.max_num_bins);
+    fo->Write(page.max_numeric_bins_per_feat);
+    bytes += sizeof(page.max_numeric_bins_per_feat);
    fo->Write(page.base_rowid);
    bytes += sizeof(page.base_rowid);
    fo->Write(page.IsDense());
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -58,6 +58,13 @@ void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, Bat
    }
  };
  auto ellpack = [&]() {
+    // workaround ellpack being initialized from CPU.
+    if (p.gpu_id == Context::kCpuId) {
+      p.gpu_id = ref_->Ctx()->gpu_id;
+    }
+    if (p.gpu_id == Context::kCpuId) {
+      p.gpu_id = 0;
+    }
    for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
      GetCutsFromEllpack(page, p_cuts);
      break;
@@ -172,9 +179,9 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
    size_t i = 0;
    while (iter.Next()) {
      if (!p_sketch) {
-        p_sketch.reset(new common::HostSketchContainer{batch_param_.max_bin,
-                                                       proxy->Info().feature_types.ConstHostSpan(),
-                                                       column_sizes, false, ctx_.Threads()});
+        p_sketch.reset(new common::HostSketchContainer{
+            batch_param_.max_bin, proxy->Info().feature_types.ConstHostSpan(), column_sizes,
+            !proxy->Info().group_ptr_.empty(), ctx_.Threads()});
      }
      HostAdapterDispatch(proxy, [&](auto const& batch) {
        proxy->Info().num_nonzero_ = batch_nnz[i];
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -42,6 +42,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
    out->Info() = this->Info().Slice(ridxs);
    out->Info().num_nonzero_ = h_offset.back();
  }
+  out->ctx_ = this->ctx_;
  return out;
 }

--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -248,8 +248,10 @@ class EvaluateSplitAgent {

 template <int kBlockSize>
 __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
-    bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
-    const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
+    bst_feature_t max_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs,
+    const EvaluateSplitSharedInputs shared_inputs,
+    common::Span<bst_feature_t> sorted_idx,
    const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
    common::Span<DeviceSplitCandidate> out_candidates) {
  // Aligned && shared storage for best_split
@@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
  __syncthreads();

  // Allocate blocks to one feature of one node
-  const auto input_idx = blockIdx.x / number_active_features;
+  const auto input_idx = blockIdx.x / max_active_features;
  const EvaluateSplitInputs &inputs = d_inputs[input_idx];
  // One block for each feature. Features are sampled, so fidx != blockIdx.x
-
-  int fidx = inputs.feature_set[blockIdx.x % number_active_features];
+  // Some blocks may not have any feature to work on, simply return
+  int feature_offset = blockIdx.x % max_active_features;
+  if (feature_offset >= inputs.feature_set.size()) {
+    return;
+  }
+  int fidx = inputs.feature_set[feature_offset];

  using AgentT = EvaluateSplitAgent<kBlockSize>;
  __shared__ typename AgentT::TempStorage temp_storage;
@@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
 }

 void GPUHistEvaluator::LaunchEvaluateSplits(
-    bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
+    bst_feature_t max_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs,
    EvaluateSplitSharedInputs shared_inputs,
    TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
    common::Span<DeviceSplitCandidate> out_splits) {
@@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
    this->SortHistogram(d_inputs, shared_inputs, evaluator);
  }

-  size_t combined_num_features = number_active_features * d_inputs.size();
-  dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
+  size_t combined_num_features = max_active_features * d_inputs.size();
+  dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
+      combined_num_features, DeviceSplitCandidate());

  // One block for each feature
  uint32_t constexpr kBlockThreads = 32;
-  dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
-      EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs,
-      shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
+  dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
+                   0}(
+      EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
+      shared_inputs,
+      this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
      evaluator, dh::ToSpan(feature_best_splits));

  // Reduce to get best candidate for left and right child over all features
-  auto reduce_offset = dh::MakeTransformIterator<size_t>(
-      thrust::make_counting_iterator(0llu),
-      [=] __device__(size_t idx) -> size_t { return idx * number_active_features; });
+  auto reduce_offset =
+      dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
+                                        [=] __device__(size_t idx) -> size_t {
+                                          return idx * max_active_features;
+                                        });
  size_t temp_storage_bytes = 0;
  auto num_segments = out_splits.size();
  cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
@@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
 }

 void GPUHistEvaluator::EvaluateSplits(
-    const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
-    common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
+    const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
+    common::Span<const EvaluateSplitInputs> d_inputs,
+    EvaluateSplitSharedInputs shared_inputs,
    common::Span<GPUExpandEntry> out_entries) {
  auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();

  dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
  auto out_splits = dh::ToSpan(splits_out_storage);
-  this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator,
-                             out_splits);
+  this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
+                             evaluator, out_splits);

  auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
  auto d_entries = out_entries;
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -170,13 +170,18 @@ class GPUHistEvaluator {
      TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);

  // impl of evaluate splits, contains CUDA kernels so it's public
-  void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs,
+  void LaunchEvaluateSplits(
+      bst_feature_t max_active_features,
+      common::Span<const EvaluateSplitInputs> d_inputs,
+      EvaluateSplitSharedInputs shared_inputs,
      TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
      common::Span<DeviceSplitCandidate> out_splits);
  /**
   * \brief Evaluate splits for left and right nodes.
   */
-  void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,
+  void EvaluateSplits(const std::vector<bst_node_t> &nidx,
+                      bst_feature_t max_active_features,
+                      common::Span<const EvaluateSplitInputs> d_inputs,
                      EvaluateSplitSharedInputs shared_inputs,
                      common::Span<GPUExpandEntry> out_splits);
  /**
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -188,7 +188,8 @@ struct GPUHistMakerDevice {
  common::Span<GradientPair> gpair;

  dh::device_vector<int> monotone_constraints;
-  dh::device_vector<float> update_predictions;
+  // node idx for each sample
+  dh::device_vector<bst_node_t> positions;

  TrainParam param;

@@ -318,24 +319,27 @@ struct GPUHistMakerDevice {
      auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
      right_sampled_features->SetDevice(ctx_->gpu_id);
      common::Span<bst_feature_t> right_feature_set =
-          interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
-      h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
-                              left_feature_set, hist.GetNodeHistogram(left_nidx)};
-      h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
-                                  right_feature_set, hist.GetNodeHistogram(right_nidx)};
+          interaction_constraints.Query(right_sampled_features->DeviceSpan(),
+                                        right_nidx);
+      h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
+                              candidate.split.left_sum, left_feature_set,
+                              hist.GetNodeHistogram(left_nidx)};
+      h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
+                                  candidate.split.right_sum, right_feature_set,
+                                  hist.GetNodeHistogram(right_nidx)};
    }
-    bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
+    bst_feature_t max_active_features = 0;
    for (auto input : h_node_inputs) {
-      CHECK_EQ(input.feature_set.size(), number_active_features)
-          << "Current implementation assumes that the number of active features "
-             "(after sampling) in any node is the same";
+      max_active_features = std::max(max_active_features,
+                                     bst_feature_t(input.feature_set.size()));
    }
-    dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
-                                  h_node_inputs.size() * sizeof(EvaluateSplitInputs),
-                                  cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(
+        d_node_inputs.data().get(), h_node_inputs.data(),
+        h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));

-    this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
-                                    shared_inputs, dh::ToSpan(entries));
+    this->evaluator_.EvaluateSplits(nidx, max_active_features,
+                                    dh::ToSpan(d_node_inputs), shared_inputs,
+                                    dh::ToSpan(entries));
    dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                  entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
                                  cudaMemcpyDeviceToHost));
@@ -423,7 +427,7 @@ struct GPUHistMakerDevice {
        LOG(FATAL) << "Current objective function can not be used with external memory.";
      }
      p_out_position->Resize(0);
-      update_predictions.clear();
+      positions.clear();
      return;
    }

@@ -458,8 +462,6 @@ struct GPUHistMakerDevice {
                              HostDeviceVector<bst_node_t>* p_out_position) {
    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    auto d_gpair = this->gpair;
-    update_predictions.resize(row_partitioner->GetRows().size());
-    auto d_update_predictions = dh::ToSpan(update_predictions);
    p_out_position->SetDevice(ctx_->gpu_id);
    p_out_position->Resize(row_partitioner->GetRows().size());

@@ -494,32 +496,45 @@ struct GPUHistMakerDevice {
        node = d_nodes[position];
      }

-      d_update_predictions[row_id] = node.LeafValue();
      return position;
    };  // NOLINT

    auto d_out_position = p_out_position->DeviceSpan();
    row_partitioner->FinalisePosition(d_out_position, new_position_op);

+    auto s_position = p_out_position->ConstDeviceSpan();
+    positions.resize(s_position.size());
+    dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
+                                  s_position.size_bytes(), cudaMemcpyDeviceToDevice));
+
    dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
      bst_node_t position = d_out_position[idx];
-      d_update_predictions[idx] = d_nodes[position].LeafValue();
      bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
      d_out_position[idx] = is_row_sampled ? ~position : position;
    });
  }

  bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
-    if (update_predictions.empty()) {
+    if (positions.empty()) {
      return false;
    }
+
    CHECK(p_tree);
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
    CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
-    auto d_update_predictions = dh::ToSpan(update_predictions);
-    CHECK_EQ(out_preds_d.Size(), d_update_predictions.size());
-    dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
-      out_preds_d(idx) += d_update_predictions[idx];
+
+    auto d_position = dh::ToSpan(positions);
+    CHECK_EQ(out_preds_d.Size(), d_position.size());
+
+    auto const& h_nodes = p_tree->GetNodes();
+    dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
+    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
+                                  h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
+    auto d_nodes = dh::ToSpan(nodes);
+    dh::LaunchN(d_position.size(), [=] XGBOOST_DEVICE(std::size_t idx) mutable {
+      bst_node_t nidx = d_position[idx];
+      auto weight = d_nodes[nidx].LeafValue();
+      out_preds_d(idx) += weight;
    });
    return true;
  }
@@ -862,6 +877,7 @@ class GPUHistMaker : public TreeUpdater {
  std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker;  // NOLINT

  char const* Name() const override { return "grow_gpu_hist"; }
+  bool HasNodePosition() const override { return true; }

 private:
  bool initialised_{false};
--- a/tests/ci_build/conda_env/cpu_test.yml
+++ b/tests/ci_build/conda_env/cpu_test.yml
@@ -36,7 +36,8 @@ dependencies:
 - cloudpickle
 - shap
 - modin
+# TODO: Replace it with pyspark>=3.4 once 3.4 is released.
+# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
+- pyspark>=3.3.1
 - pip:
  - datatable
-  # TODO: Replace it with pyspark>=3.4 once 3.4 released.
-  - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
--- a/tests/cpp/data/test_array_interface.cc
+++ b/tests/cpp/data/test_array_interface.cc
@@ -1,10 +1,12 @@
-/*!
- * Copyright 2020-2021 by XGBoost Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
 #include "../helpers.h"
 #include "../../../src/data/array_interface.h"
+#include "dmlc/logging.h"
+#include "xgboost/json.h"

 namespace xgboost {
 TEST(ArrayInterface, Initialize) {
@@ -71,6 +73,14 @@ TEST(ArrayInterface, Error) {
  column["mask"]["data"] = Null{};
  common::Span<RBitField8::value_type> s_mask;
  EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error);
+
+  get<Object>(column).erase("mask");
+  // misaligned.
+  j_data = {Json(Integer(reinterpret_cast<Integer::Int>(
+                reinterpret_cast<char const*>(storage.ConstHostPointer()) + 1))),
+            Json(Boolean(false))};
+  column["data"] = j_data;
+  EXPECT_THROW({ ArrayInterface<1> arr{column}; }, dmlc::Error);
 }

 TEST(ArrayInterface, GetElement) {
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
  }
 }

+TEST(GradientIndex, FromCategoricalLarge) {
+  size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
+  bst_bin_t max_bins = 8;
+  auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
+  auto m = GetDMatrixFromData(x, kRows, 1);
+  Context ctx;
+
+  auto &h_ft = m->Info().feature_types.HostVector();
+  h_ft.resize(kCols, FeatureType::kCategorical);
+
+  BatchParam p{max_bins, 0.8};
+  {
+    GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, Context{}.Threads(), {});
+    ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
+  }
+  {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
+      common::HistogramCuts cut = page.cut;
+      GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
+      ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
+    }
+  }
+}
+
 TEST(GradientIndex, PushBatch) {
  size_t constexpr kRows = 64, kCols = 4;
  bst_bin_t max_bins = 64;
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -1,13 +1,19 @@
-// Copyright by Contributors
+/**
+ * Copyright 2016-2023 by XGBoost Contributors
+ */
 #include <xgboost/data.h>

-#include <array>
+#include <array>   // std::array
+#include <limits>  // std::numeric_limits
+#include <memory>  // std::unique_ptr

-#include "../../../src/data/adapter.h"
-#include "../../../src/data/simple_dmatrix.h"
+#include "../../../src/data/adapter.h"         // ArrayAdapter
+#include "../../../src/data/simple_dmatrix.h"  // SimpleDMatrix
 #include "../filesystem.h"                     // dmlc::TemporaryDirectory
-#include "../helpers.h"
+#include "../helpers.h"                        // RandomDataGenerator,CreateSimpleTestData
 #include "xgboost/base.h"
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/string_view.h"         // StringView

 using namespace xgboost;  // NOLINT

@@ -298,6 +304,17 @@ TEST(SimpleDMatrix, Slice) {
  ASSERT_EQ(out->Info().num_col_, out->Info().num_col_);
  ASSERT_EQ(out->Info().num_row_, ridxs.size());
  ASSERT_EQ(out->Info().num_nonzero_, ridxs.size() * kCols);  // dense
+
+  {
+    HostDeviceVector<float> data;
+    auto arr_str = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&data);
+    auto adapter = data::ArrayAdapter{StringView{arr_str}};
+    auto n_threads = 2;
+    std::unique_ptr<DMatrix> p_fmat{
+        DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), n_threads, "")};
+    std::unique_ptr<DMatrix> slice{p_fmat->Slice(ridxs)};
+    ASSERT_LE(slice->Ctx()->Threads(), n_threads);
+  }
 }

 TEST(SimpleDMatrix, SaveLoadBinary) {
--- a/tests/cpp/tree/test_node_partition.cc
+++ b/tests/cpp/tree/test_node_partition.cc
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2023 by XGBoost contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/task.h>
+#include <xgboost/tree_updater.h>
+
+namespace xgboost {
+TEST(Updater, HasNodePosition) {
+  Context ctx;
+  ObjInfo task{ObjInfo::kRegression, true, true};
+  std::unique_ptr<TreeUpdater> up{TreeUpdater::Create("grow_histmaker", &ctx, task)};
+  ASSERT_TRUE(up->HasNodePosition());
+
+  up.reset(TreeUpdater::Create("grow_quantile_histmaker", &ctx, task));
+  ASSERT_TRUE(up->HasNodePosition());
+
+#if defined(XGBOOST_USE_CUDA)
+  ctx.gpu_id = 0;
+  up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, task));
+  ASSERT_TRUE(up->HasNodePosition());
+#endif  // defined(XGBOOST_USE_CUDA)
+}
+}  // namespace xgboost
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -139,3 +139,17 @@ class TestDeviceQuantileDMatrix:
            booster.predict(xgb.DMatrix(d_m.get_data())),
            atol=1e-6,
        )
+
+    def test_ltr(self) -> None:
+        import cupy as cp
+        X, y, qid, w = tm.make_ltr(100, 3, 3, 5)
+        # make sure GPU is used to run sketching.
+        cpX = cp.array(X)
+        Xy_qdm = xgb.QuantileDMatrix(cpX, y, qid=qid, weight=w)
+        Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
+        xgb.train({"tree_method": "gpu_hist", "objective": "rank:ndcg"}, Xy)
+
+        from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
+        from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
+
+        assert tm.predictor_equal(from_qdm, from_dm)
--- a/tests/python-gpu/test_gpu_interaction_constraints.py
+++ b/tests/python-gpu/test_gpu_interaction_constraints.py
@@ -1,8 +1,14 @@
-import numpy as np
 import sys
+
+import numpy as np
+import pandas as pd
+
+import xgboost as xgb
+
 sys.path.append("tests/python")
 # Don't import the test class, otherwise they will run twice.
 import test_interaction_constraints as test_ic  # noqa
+
 rng = np.random.RandomState(1994)


@@ -10,7 +16,34 @@ class TestGPUInteractionConstraints:
    cputest = test_ic.TestInteractionConstraints()

    def test_interaction_constraints(self):
-        self.cputest.run_interaction_constraints(tree_method='gpu_hist')
+        self.cputest.run_interaction_constraints(tree_method="gpu_hist")

    def test_training_accuracy(self):
-        self.cputest.training_accuracy(tree_method='gpu_hist')
+        self.cputest.training_accuracy(tree_method="gpu_hist")
+
+    # case where different number of features can occur in the evaluator
+    def test_issue_8730(self):
+        X = pd.DataFrame(
+            zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)),
+            columns=["A", "B", "C", "D"],
+        )
+        y = np.array([*([0] * 50), *([1] * 50)])
+        dm = xgb.DMatrix(X, label=y)
+
+        params = {
+            "eta": 0.16095019509249486,
+            "min_child_weight": 1,
+            "subsample": 0.688567929338029,
+            "colsample_bynode": 0.7,
+            "gamma": 5.666579817418348e-06,
+            "lambda": 0.14943712232059794,
+            "grow_policy": "depthwise",
+            "max_depth": 3,
+            "tree_method": "gpu_hist",
+            "interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]],
+            "objective": "count:poisson",
+            "eval_metric": "poisson-nloglik",
+            "verbosity": 0,
+        }
+
+        xgb.train(params, dm, num_boost_round=100)
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -338,13 +338,21 @@ class TestGPUPredict:
    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
+        # Unsupported for random forest
+        if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
+            return
+
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
-    def test_predict_leaf_dart(self, param, dataset):
+    def test_predict_leaf_dart(self, param: dict, dataset: tm.TestDataset) -> None:
+        # Unsupported for random forest
+        if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
+            return
+
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -326,7 +326,7 @@ class TestDMatrix:
        nrow = 100
        ncol = 1000
        x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
-        assert x.indices.max() < ncol - 1
+        assert x.indices.max() < ncol
        x.data[:] = 1
        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
        assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -9,7 +9,9 @@ from testing import (
    make_batches,
    make_batches_sparse,
    make_categorical,
+    make_ltr,
    make_sparse_regression,
+    predictor_equal,
 )

 import xgboost as xgb
@@ -218,6 +220,16 @@ class TestQuantileDMatrix:
        b = booster.predict(qXy)
        np.testing.assert_allclose(a, b)

+    def test_ltr(self) -> None:
+        X, y, qid, w = make_ltr(100, 3, 3, 5)
+        Xy_qdm = xgb.QuantileDMatrix(X, y, qid=qid, weight=w)
+        Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
+        xgb.train({"tree_method": "hist", "objective": "rank:ndcg"}, Xy)
+
+        from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
+        from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
+        assert predictor_equal(from_qdm, from_dm)
+
    # we don't test empty Quantile DMatrix in single node construction.
    @given(
        strategies.integers(1, 1000),
--- a/tests/python/test_spark/test_spark_local.py
+++ b/tests/python/test_spark/test_spark_local.py
@@ -41,6 +41,16 @@ logging.getLogger("py4j").setLevel(logging.INFO)
 pytestmark = testing.timeout(60)


+def no_sparse_unwrap():
+    try:
+        from pyspark.sql.functions import unwrap_udt
+
+    except ImportError:
+        return {"reason": "PySpark<3.4", "condition": True}
+
+    return {"reason": "PySpark<3.4", "condition": False}
+
+
 class XgboostLocalTest(SparkTestCase):
    def setUp(self):
        logging.getLogger().setLevel("INFO")
@@ -985,6 +995,7 @@ class XgboostLocalTest(SparkTestCase):
        model = classifier.fit(self.cls_df_train)
        model.transform(self.cls_df_test).collect()

+    @pytest.mark.skipif(**no_sparse_unwrap())
    def test_regressor_with_sparse_optim(self):
        regressor = SparkXGBRegressor(missing=0.0)
        model = regressor.fit(self.reg_df_sparse_train)
@@ -1001,6 +1012,7 @@ class XgboostLocalTest(SparkTestCase):
        for row1, row2 in zip(pred_result, pred_result2):
            self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3))

+    @pytest.mark.skipif(**no_sparse_unwrap())
    def test_classifier_with_sparse_optim(self):
        cls = SparkXGBClassifier(missing=0.0)
        model = cls.fit(self.cls_df_sparse_train)
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -458,6 +458,22 @@ class TestTreeMethod:
        config_0 = json.loads(booster_0.save_config())
        np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)

+        evals_result: Dict[str, Dict[str, list]] = {}
+        xgb.train(
+            {
+                "tree_method": tree_method,
+                "objective": "reg:absoluteerror",
+                "subsample": 0.8
+            },
+            Xy,
+            num_boost_round=10,
+            evals=[(Xy, "Train")],
+            evals_result=evals_result,
+        )
+        mae = evals_result["Train"]["mae"]
+        assert mae[-1] < 20.0
+        assert tm.non_increasing(mae)
+
    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.parametrize(
        "tree_method,weighted", [
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -466,7 +466,22 @@ def make_categorical(
    return df, label


-def _cat_sampled_from():
+def make_ltr(
+    n_samples: int, n_features: int, n_query_groups: int, max_rel: int
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Make a dataset for testing LTR."""
+    rng = np.random.default_rng(1994)
+    X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
+    y = rng.integers(0, max_rel, size=n_samples)
+    qid = rng.integers(0, n_query_groups, size=n_samples)
+    w = rng.normal(0, 1.0, size=n_query_groups)
+    w -= np.min(w)
+    w /= np.max(w)
+    qid = np.sort(qid)
+    return X, y, qid, w
+
+
+def _cat_sampled_from() -> strategies.SearchStrategy:
    @strategies.composite
    def _make_cat(draw):
        n_samples = draw(strategies.integers(2, 512))
@@ -775,6 +790,19 @@ class DirectoryExcursion:
                os.remove(f)


+def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
+    """Return whether two DMatrices contain the same predictors."""
+    lcsr = lhs.get_data()
+    rcsr = rhs.get_data()
+    return all(
+        (
+            np.array_equal(lcsr.data, rcsr.data),
+            np.array_equal(lcsr.indices, rcsr.indices),
+            np.array_equal(lcsr.indptr, rcsr.indptr),
+        )
+    )
+
+
 @contextmanager
 def captured_output():
    """Reassign stdout temporarily in order to test printed statements
Author	SHA1	Message	Date
Jiaming Yuan	36ad160501	Bump version to 1.7.4. (#8805 )	2023-02-16 06:40:01 +08:00
Jiaming Yuan	c22f6db4bf	[backport] Fix CPU bin compression with categorical data. (#8809 ) (#8810 ) * [backport] Fix CPU bin compression with categorical data. (#8809) * Fix CPU bin compression with categorical data. * The bug causes the maximum category to be lesser than 256 or the maximum number of bins when the input data is dense. * Avoid test symbol.	2023-02-16 06:39:25 +08:00
Jiaming Yuan	f15a6d2b19	[backport] Fix ranking with quantile dmatrix and group weight. (#8762 ) (#8800 ) * [backport] Fix ranking with quantile dmatrix and group weight. (#8762) * backport test utilities.	2023-02-15 02:45:09 +08:00
Jiaming Yuan	08a547f5c2	[backport] Fix feature types param (#8772 ) (#8801 ) Signed-off-by: Weichen Xu <weichen.xu@databricks.com> Co-authored-by: WeichenXu <weichen.xu@databricks.com>	2023-02-15 01:39:20 +08:00
Jiaming Yuan	60303db2ee	[backport] Fix GPU L1 error. (#8749 ) (#8770 ) * [backport] Fix GPU L1 error. (#8749) * Fix backport.	2023-02-09 20:16:39 +08:00
Jiaming Yuan	df984f9c43	[backport] Fix different number of features in gpu_hist evaluator. (#8754 ) (#8769 ) Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>	2023-02-09 18:31:49 +08:00
Jiaming Yuan	2f22f8d49b	[backport] Make sure input numpy array is aligned. (#8690 ) (#8696 ) (#8734 ) * [backport] Make sure input numpy array is aligned. (#8690) - use `np.require` to specify that the alignment is required. - scipy csr as well. - validate input pointer in `ArrayInterface`. * Workaround CUDA warning. (#8696) * backport from half type support for alignment. * fix import.	2023-02-06 16:58:15 +08:00
Jiaming Yuan	68d86336d7	[backport] [R] fix OpenMP detection on macOS (#8684 ) (#8732 ) Co-authored-by: James Lamb <jaylamb20@gmail.com>	2023-01-29 12:43:10 +08:00
Jiaming Yuan	76bdca072a	[R] Fix threads used to create DMatrix in predict. (#8681 ) (#8682 )	2023-01-15 04:00:31 +08:00
Jiaming Yuan	021e6a842a	[backport] [R] Get CXX flags from R CMD config. (#8669 ) (#8680 )	2023-01-14 18:46:59 +08:00
Jiaming Yuan	e5bef4ffce	[backport] Fix threads in DMatrix slice. (#8667 ) (#8679 )	2023-01-14 18:46:04 +08:00
Jiaming Yuan	10bb0a74ef	[backport] [CI] Skip pyspark sparse tests. (#8675 ) (#8678 )	2023-01-14 06:40:17 +08:00
Jiaming Yuan	e803d06d8c	[backport] [R] Remove unused assert definition. (#8526 ) (#8668 )	2023-01-13 04:55:29 +08:00
@@ -1 +1 @@
-1.7.3
+1.7.4