sync Jun 1
commit 42867a4805

.github/workflows/jvm_tests.yml (vendored)
@@ -75,3 +75,13 @@ jobs:
       if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
       env:
         RABIT_MOCK: ON
+
+    - name: Build and Test XGBoost4J with scala 2.13
+      run: |
+        rm -rfv build/
+        cd jvm-packages
+        mvn -B clean install test -Pdefault,scala-2.13
+      if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
+      env:
+        RABIT_MOCK: ON
@@ -171,6 +171,8 @@ if (USE_CUDA)
   set(GEN_CODE "")
   format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
+
+  find_package(CUDAToolkit REQUIRED)
 endif (USE_CUDA)

 if (USE_HIP)

@@ -124,13 +124,6 @@ function(format_gencode_flags flags out)
   endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18")
 endfunction(format_gencode_flags flags)

-macro(enable_nvtx target)
-  find_package(NVTX REQUIRED)
-  target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
-  target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
-  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
-endmacro()
-
 # Set CUDA related flags to target. Must be used after code `format_gencode_flags`.
 function(xgboost_set_cuda_flags target)
   target_compile_options(${target} PRIVATE

@@ -162,11 +155,14 @@ function(xgboost_set_cuda_flags target)
   endif (USE_DEVICE_DEBUG)

   if (USE_NVTX)
-    enable_nvtx(${target})
+    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
   endif (USE_NVTX)

   target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1)
-  target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/gputreeshap)
+  target_include_directories(
+    ${target} PRIVATE
+    ${xgboost_SOURCE_DIR}/gputreeshap
+    ${CUDAToolkit_INCLUDE_DIRS})

   if (MSVC)
     target_compile_options(${target} PRIVATE

@@ -314,7 +310,7 @@ macro(xgboost_target_link_libraries target)
   endif (USE_NCCL)

   if (USE_NVTX)
-    enable_nvtx(${target})
+    target_link_libraries(${target} PRIVATE CUDA::nvToolsExt)
   endif (USE_NVTX)

   if (RABIT_BUILD_MPI)
@@ -1,26 +0,0 @@
if (NVTX_LIBRARY)
  unset(NVTX_LIBRARY CACHE)
endif (NVTX_LIBRARY)

set(NVTX_LIB_NAME nvToolsExt)

find_path(NVTX_INCLUDE_DIR
  NAMES nvToolsExt.h
  PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)

find_library(NVTX_LIBRARY
  NAMES nvToolsExt
  PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)

message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVTX DEFAULT_MSG
  NVTX_INCLUDE_DIR NVTX_LIBRARY)

mark_as_advanced(
  NVTX_INCLUDE_DIR
  NVTX_LIBRARY
)
@@ -38,19 +38,18 @@ def using_dask_matrix(client: Client, X, y):


 def using_quantile_device_dmatrix(client: Client, X, y):
-    """`DaskQuantileDMatrix` is a data type specialized for `gpu_hist`, tree
-    method that reduces memory overhead. When training on GPU pipeline, it's
-    preferred over `DaskDMatrix`.
+    """`DaskQuantileDMatrix` is a data type specialized for `gpu_hist` and `hist` tree
+    methods for reducing memory usage.

     .. versionadded:: 1.2.0

     """
     # Input must be on GPU for `DaskQuantileDMatrix`.
     X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
     y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))

-    # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful
-    # that it can not be used for anything else other than training.
+    # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not
+    # be used for anything else other than training unless a reference is specified. See
+    # the `ref` argument of `DaskQuantileDMatrix`.
     dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
     output = xgb.dask.train(
         client, {"verbosity": 2, "tree_method": "gpu_hist"}, dtrain, num_boost_round=4
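A hedged sketch of the `ref` mechanism mentioned in the new comment (the validation inputs `X_valid`/`y_valid` and the helper name are assumptions for illustration, not part of this demo): a second `DaskQuantileDMatrix` built with `ref=dtrain` reuses the quantile cuts computed for training, which is what makes it usable beyond plain training.

```python
from xgboost import dask as dxgb

def train_with_validation(client, X, y, X_valid, y_valid):
    dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
    # Sharing quantile information via `ref` keeps the validation matrix
    # consistent with training and avoids recomputing the sketch.
    dvalid = dxgb.DaskQuantileDMatrix(client, X_valid, y_valid, ref=dtrain)
    return dxgb.train(
        client,
        {"tree_method": "gpu_hist"},
        dtrain,
        num_boost_round=4,
        evals=[(dvalid, "validation")],
    )
```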
@@ -27,20 +27,29 @@ In the following two sections, we will provide a step by step walk through of im
the ``Squared Log Error (SLE)`` objective function:

.. math::
-   \frac{1}{2}[log(pred + 1) - log(label + 1)]^2
+   \frac{1}{2}[\log(pred + 1) - \log(label + 1)]^2

and its default metric ``Root Mean Squared Log Error (RMSLE)``:

.. math::
-   \sqrt{\frac{1}{N}[log(pred + 1) - log(label + 1)]^2}
+   \sqrt{\frac{1}{N}[\log(pred + 1) - \log(label + 1)]^2}

Although XGBoost has native support for said functions, using it for demonstration
provides us the opportunity of comparing the result from our own implementation and the
one from XGBoost internal for learning purposes. After finishing this tutorial, we should
be able to provide our own functions for rapid experiments. And at the end, we will
provide some notes on non-identity link functions along with examples of using custom metric
-and objective with `scikit-learn` interface.
+and objective with the `scikit-learn` interface.

If we compute the gradient of said objective function:

.. math::
   g = \frac{\partial{objective}}{\partial{pred}} = \frac{\log(pred + 1) - \log(label + 1)}{pred + 1}

As well as the hessian (the second derivative of the objective):

.. math::
   h = \frac{\partial^2{objective}}{\partial{pred}^2} = \frac{ - \log(pred + 1) + \log(label + 1) + 1}{(pred + 1)^2}
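Taken together, the gradient and hessian above are everything a custom objective must return. A minimal sketch of how they translate into code (the name ``squared_log`` and the clipping guard are illustrative assumptions, not part of this commit):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    def squared_log(predt: np.ndarray, dtrain: xgb.DMatrix):
        """Squared Log Error objective; returns the gradient and hessian."""
        y = dtrain.get_label()
        predt[predt < -1] = -1 + 1e-6  # keep log(pred + 1) well defined
        grad = (np.log1p(predt) - np.log1p(y)) / (predt + 1)
        hess = (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2)
        return grad, hess

    # e.g. xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=10, obj=squared_log)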
*****************************
Customized Objective Function

@@ -519,6 +519,9 @@ Troubleshooting
  the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
  logs.

+- If NCCL fails to initialize in a container environment, it might be caused by limited
+  system shared memory. With docker, one can try the flag: `--shm-size=4g`.
+
 - MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
   that includes `Multiple processes within a communication group ...` upon initialization.
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 XGBoost contributors
+/**
+ * Copyright 2018-2023, XGBoost contributors
  * \brief span class based on ISO++20 span
  *
  * About NOLINTs in this file:

@@ -33,20 +33,16 @@
 #include <xgboost/logging.h>

-#include <cinttypes>   // size_t
-#include <limits>      // numeric_limits
-#include <iterator>
-#include <type_traits>
+#include <cstdio>
+#include <iterator>
+#include <limits>  // numeric_limits
+#include <type_traits>
+#include <utility>  // for move

 #if defined(__CUDACC__)
 #include <cuda_runtime.h>
 #elif defined(__HIP_PLATFORM_AMD__)
 #include <hip/hip_runtime.h>

 extern "C" void __assert_fail (const char *__assertion, const char *__file,
                                unsigned int __line, const char *__function)
   noexcept (true) __attribute__ ((__noreturn__));

 #endif

 /*!

@@ -127,7 +123,7 @@ namespace common {

 #define __ASSERT_STR_HELPER(x) #x

-#if 1
+#if 0
 #define HIP_KERNEL_CHECK(cond) \
   (XGBOOST_EXPECT((cond), true) \
     ? static_cast<void>(0) \
@@ -710,6 +706,44 @@ XGBOOST_DEVICE auto as_writable_bytes(Span<T, E> s) __span_noexcept ->  // NOLIN
     Span<byte, detail::ExtentAsBytesValue<T, E>::value> {
   return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
 }

+/**
+ * \brief A simple custom Span type that uses general iterator instead of pointer.
+ */
+template <typename It>
+class IterSpan {
+ public:
+  using value_type = typename std::iterator_traits<It>::value_type;  // NOLINT
+  using index_type = std::size_t;                                    // NOLINT
+  using iterator = It;                                               // NOLINT
+
+ private:
+  It it_;
+  index_type size_{0};
+
+ public:
+  IterSpan() = default;
+  XGBOOST_DEVICE IterSpan(It it, index_type size) : it_{std::move(it)}, size_{size} {}
+  XGBOOST_DEVICE explicit IterSpan(common::Span<It, dynamic_extent> span)
+      : it_{span.data()}, size_{span.size()} {}
+
+  [[nodiscard]] XGBOOST_DEVICE index_type size() const noexcept { return size_; }  // NOLINT
+  [[nodiscard]] XGBOOST_DEVICE decltype(auto) operator[](index_type i) const { return it_[i]; }
+  [[nodiscard]] XGBOOST_DEVICE decltype(auto) operator[](index_type i) { return it_[i]; }
+  [[nodiscard]] XGBOOST_DEVICE bool empty() const noexcept { return size() == 0; }  // NOLINT
+  [[nodiscard]] XGBOOST_DEVICE It data() const noexcept { return it_; }  // NOLINT
+  [[nodiscard]] XGBOOST_DEVICE IterSpan<It> subspan(  // NOLINT
+      index_type _offset, index_type _count = dynamic_extent) const {
+    SPAN_CHECK((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size()));
+    return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count};
+  }
+  [[nodiscard]] XGBOOST_DEVICE constexpr iterator begin() const noexcept {  // NOLINT
+    return {this, 0};
+  }
+  [[nodiscard]] XGBOOST_DEVICE constexpr iterator end() const noexcept {  // NOLINT
+    return {this, size()};
+  }
+};
 }  // namespace common
 }  // namespace xgboost
jvm-packages/.gitignore (vendored)

@@ -1,2 +1,4 @@
 tracker.py
 build.sh
+xgboost4j-tester/pom.xml
+xgboost4j-tester/iris.csv
@@ -36,6 +36,19 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5.
   <version>latest_version_num</version>
 </dependency>
 ```
+or
+```
+<dependency>
+  <groupId>ml.dmlc</groupId>
+  <artifactId>xgboost4j_2.13</artifactId>
+  <version>latest_version_num</version>
+</dependency>
+<dependency>
+  <groupId>ml.dmlc</groupId>
+  <artifactId>xgboost4j-spark_2.13</artifactId>
+  <version>latest_version_num</version>
+</dependency>
+```

<b>sbt</b>
```sbt

@@ -47,7 +60,6 @@ libraryDependencies ++= Seq(

For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).

-To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.

### Access SNAPSHOT version
@@ -85,6 +97,19 @@ Then add XGBoost4J as a dependency:
   <version>latest_version_num-SNAPSHOT</version>
 </dependency>
 ```
+or with Scala 2.13
+```
+<dependency>
+  <groupId>ml.dmlc</groupId>
+  <artifactId>xgboost4j_2.13</artifactId>
+  <version>latest_version_num-SNAPSHOT</version>
+</dependency>
+<dependency>
+  <groupId>ml.dmlc</groupId>
+  <artifactId>xgboost4j-spark_2.13</artifactId>
+  <version>latest_version_num-SNAPSHOT</version>
+</dependency>
+```

<b>sbt</b>
```sbt

@@ -96,7 +121,9 @@ libraryDependencies ++= Seq(

For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html).

+### GPU algorithm
 To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
+Note that Scala 2.13 is not yet supported by spark-rapids (see [NVIDIA/spark-rapids#1525](https://github.com/NVIDIA/spark-rapids/issues/1525)), so the GPU algorithm can only be used with Scala 2.12.

## Examples
@@ -5,7 +5,7 @@
   <modelVersion>4.0.0</modelVersion>

   <groupId>ml.dmlc</groupId>
-  <artifactId>xgboost-jvm_2.12</artifactId>
+  <artifactId>xgboost-jvm</artifactId>
   <version>2.0.0-SNAPSHOT</version>
   <packaging>pom</packaging>
   <name>XGBoost JVM Package</name>

@@ -33,7 +33,8 @@
     <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
     <maven.compiler.source>1.8</maven.compiler.source>
     <maven.compiler.target>1.8</maven.compiler.target>
-    <flink.version>1.17.0</flink.version>
+    <flink.version>1.17.1</flink.version>
     <junit.version>4.13.2</junit.version>
     <spark.version>3.4.0</spark.version>
     <spark.version.gpu>3.3.2</spark.version.gpu>
     <scala.version>2.12.17</scala.version>

@@ -45,6 +46,8 @@
     <cudf.version>23.04.0</cudf.version>
     <spark.rapids.version>23.04.1</spark.rapids.version>
     <cudf.classifier>cuda11</cudf.classifier>
+    <scalatest.version>3.2.16</scalatest.version>
+    <scala-collection-compat.version>2.10.0</scala-collection-compat.version>
   </properties>
   <repositories>
     <repository>

@@ -71,6 +74,14 @@
     </modules>
   </profile>

+  <profile>
+    <id>scala-2.13</id>
+    <properties>
+      <scala.binary.version>2.13</scala.binary.version>
+      <scala.version>2.13.10</scala.version>
+    </properties>
+  </profile>
+
   <!-- gpu profile with both cpu and gpu test suites -->
   <profile>
     <id>gpu</id>

@@ -451,7 +462,7 @@
     <plugins>
       <plugin>
         <artifactId>maven-project-info-reports-plugin</artifactId>
-        <version>3.4.3</version>
+        <version>3.4.4</version>
       </plugin>
       <plugin>
         <groupId>net.alchim31.maven</groupId>

@@ -467,6 +478,7 @@
     </plugins>
   </reporting>
   <dependencies>
+
     <dependency>
       <groupId>com.esotericsoftware</groupId>
       <artifactId>kryo</artifactId>

@@ -483,6 +495,11 @@
       <artifactId>scala-library</artifactId>
       <version>${scala.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.scala-lang.modules</groupId>
+      <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
+      <version>${scala-collection-compat.version}</version>
+    </dependency>
     <dependency>
       <groupId>commons-logging</groupId>
       <artifactId>commons-logging</artifactId>

@@ -491,13 +508,13 @@
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>
-      <version>3.2.16</version>
+      <version>${scalatest.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.scalactic</groupId>
       <artifactId>scalactic_${scala.binary.version}</artifactId>
-      <version>3.2.15</version>
+      <version>${scalatest.version}</version>
       <scope>test</scope>
     </dependency>
   </dependencies>
@@ -5,10 +5,11 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
   </parent>
-  <artifactId>xgboost4j-example_2.12</artifactId>
   <name>xgboost4j-example</name>
+  <artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
+  <version>2.0.0-SNAPSHOT</version>
   <packaging>jar</packaging>
   <build>
@@ -73,12 +73,13 @@ object DistTrainWithFlink {
       .map(_.f1.f0)
       .returns(testDataTypeHint)

-    val paramMap = mapAsJavaMap(Map(
+    val paramMap = Map(
       ("eta", "0.1".asInstanceOf[AnyRef]),
       ("max_depth", "2"),
       ("objective", "binary:logistic"),
       ("verbosity", "1")
-    ))
+    )
+      .asJava

     // number of iterations
     val round = 2
@@ -20,10 +20,9 @@ import org.apache.spark.ml.{Pipeline, PipelineModel}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.tuning._
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.types._

-import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}
+import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}

 // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)

@@ -50,6 +49,13 @@ object SparkMLlibPipeline {
       .appName("XGBoost4J-Spark Pipeline Example")
       .getOrCreate()

+    run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers)
+      .show(false)
+  }
+
+  private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String,
+                         pipelineModelPath: String, treeMethod: String,
+                         numWorkers: Int): DataFrame = {
     // Load dataset
     val schema = new StructType(Array(
       StructField("sepal length", DoubleType, true),

@@ -90,11 +96,11 @@ object SparkMLlibPipeline {
     val labelConverter = new IndexToString()
       .setInputCol("prediction")
       .setOutputCol("realLabel")
-      .setLabels(labelIndexer.labels)
+      .setLabels(labelIndexer.labelsArray(0))

     val pipeline = new Pipeline()
       .setStages(Array(assembler, labelIndexer, booster, labelConverter))
-    val model = pipeline.fit(training)
+    val model: PipelineModel = pipeline.fit(training)

     // Batch prediction
     val prediction = model.transform(test)

@@ -136,6 +142,6 @@ object SparkMLlibPipeline {

     // Load a saved model and serving
     val model2 = PipelineModel.load(pipelineModelPath)
-    model2.transform(test).show(false)
+    model2.transform(test)
   }
 }
@@ -17,9 +17,8 @@
 package ml.dmlc.xgboost4j.scala.example.spark

 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
-
 import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

 // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)

@@ -38,6 +37,12 @@ object SparkTraining {

     val spark = SparkSession.builder().getOrCreate()
     val inputPath = args(0)
+    val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers)
+    results.show()
+  }
+
+  private[spark] def run(spark: SparkSession, inputPath: String,
+                         treeMethod: String, numWorkers: Int): DataFrame = {
     val schema = new StructType(Array(
       StructField("sepal length", DoubleType, true),
       StructField("sepal width", DoubleType, true),

@@ -81,7 +86,6 @@ object SparkTraining {
       setFeaturesCol("features").
       setLabelCol("classIndex")
     val xgbClassificationModel = xgbClassifier.fit(train)
-    val results = xgbClassificationModel.transform(test)
-    results.show()
+    xgbClassificationModel.transform(test)
   }
 }
@@ -0,0 +1,123 @@
/*
 Copyright (c) 2014-2023 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
package ml.dmlc.xgboost4j.scala.example.spark

import org.apache.spark.sql.SparkSession
import org.scalatest.BeforeAndAfterAll
import org.scalatest.funsuite.AnyFunSuite
import org.slf4j.LoggerFactory

import java.io.File
import java.nio.file.{Files, StandardOpenOption}
import scala.jdk.CollectionConverters._
import scala.util.{Random, Try}

class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll {
  private val logger = LoggerFactory.getLogger(classOf[SparkExamplesTest])
  private val random = new Random(42)
  protected val numWorkers: Int = scala.math.min(Runtime.getRuntime.availableProcessors(), 4)

  private val pathToTestDataset = Files.createTempFile("", "iris.csv").toAbsolutePath
  private var spark: SparkSession = _

  override def beforeAll(): Unit = {

    def generateLine(i: Int): String = {
      val getIrisName = (int: Int) => {
        int % 3 match {
          case 0 => "Iris-versicolor"
          case 1 => "Iris-virginica"
          case 2 => "Iris-setosa"
        }
      }
      val generateValue = () => Math.abs(random.nextInt(99) * 0.1)
      val sepalLength = generateValue()
      val sepalWidth = generateValue()
      val petalLength = generateValue()
      val petalWidth = generateValue()
      val irisName = getIrisName(Math.abs(random.nextInt()) + i)
      s"$sepalLength,$sepalWidth,$petalLength,$petalWidth,$irisName"
    }

    if (spark == null) {
      spark = SparkSession
        .builder()
        .appName("XGBoost4J-Spark Pipeline Example")
        .master(s"local[${numWorkers}]")
        .config("spark.ui.enabled", value = false)
        .config("spark.driver.memory", "512m")
        .config("spark.barrier.sync.timeout", 10)
        .config("spark.task.cpus", 1)
        .getOrCreate()
      spark.sparkContext.setLogLevel("ERROR")
    }
    val data = (0 until 150)
      .map(i => generateLine(i))
      .toList
      .asJava
    Files.write(pathToTestDataset,
      data,
      StandardOpenOption.CREATE,
      StandardOpenOption.WRITE,
      StandardOpenOption.TRUNCATE_EXISTING)
    logger.info(s"${new String(Files.readAllBytes(pathToTestDataset))}")
  }

  override def afterAll(): Unit = {
    if (spark != null) {
      spark.stop()
      cleanExternalCache(spark.sparkContext.appName)
      spark = null
    }

    Try(Files.deleteIfExists(pathToTestDataset))
      .recover {
        case e =>
          logger.warn(
            s"Could not delete temporary file $pathToTestDataset. Please, remove it manually",
            e
          )
          true
      }
  }

  private def cleanExternalCache(prefix: String): Unit = {
    val dir = new File(".")
    for (file <- dir.listFiles() if file.getName.startsWith(prefix)) {
      file.delete()
    }
  }

  test("Smoke test for SparkMLlibPipeline example") {
    SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model",
      "target/pipeline-model", "auto", 2)
  }

  test("Smoke test for SparkTraining example") {
    val spark = SparkSession
      .builder()
      .appName("XGBoost4J-Spark Pipeline Example")
      .master(s"local[${numWorkers}]")
      .config("spark.ui.enabled", value = false)
      .config("spark.driver.memory", "512m")
      .config("spark.barrier.sync.timeout", 10)
      .config("spark.task.cpus", 1)
      .getOrCreate()

    SparkTraining.run(spark, pathToTestDataset.toString, "auto", 2)
  }
}
@@ -5,9 +5,11 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
   </parent>

   <name>xgboost4j-flink</name>
+  <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
+  <version>2.0.0-SNAPSHOT</version>
   <properties>
@@ -5,10 +5,11 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
   </parent>
-  <artifactId>xgboost4j-gpu_2.12</artifactId>
+  <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
   <name>xgboost4j-gpu</name>
   <version>2.0.0-SNAPSHOT</version>
   <packaging>jar</packaging>

@@ -35,13 +36,13 @@
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>4.13.2</version>
+      <version>${junit.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>
-      <version>3.2.15</version>
+      <version>${scalatest.version}</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
@@ -5,10 +5,11 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
   </parent>
-  <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
   <name>xgboost4j-spark-gpu</name>
+  <artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
   <build>
     <plugins>
       <plugin>

@@ -24,7 +25,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
-      <version>2.0.0-SNAPSHOT</version>
+      <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -5,10 +5,11 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
   </parent>
-  <artifactId>xgboost4j-spark_2.12</artifactId>
   <name>xgboost4j-spark</name>
+  <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
   <build>
     <plugins>
       <plugin>

@@ -24,7 +25,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>2.0.0-SNAPSHOT</version>
+      <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -8,25 +8,28 @@ pom_template = """
   <modelVersion>4.0.0</modelVersion>

   <groupId>ml.dmlc</groupId>
-  <artifactId>xgboost4j-tester_2.12</artifactId>
+  <artifactId>xgboost4j-tester_{scala_binary_version}</artifactId>
   <version>1.0-SNAPSHOT</version>
-  <name>xgboost4j-tester_2.12</name>
+  <name>xgboost4j-tester</name>

   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <maven.compiler.source>{maven_compiler_source}</maven.compiler.source>
     <maven.compiler.target>{maven_compiler_target}</maven.compiler.target>
+    <junit.version>4.13.2</junit.version>
     <spark.version>{spark_version}</spark.version>
     <scala.version>{scala_version}</scala.version>
+    <scalatest.version>3.2.15</scalatest.version>
     <scala.binary.version>{scala_binary_version}</scala.binary.version>
+    <kryo.version>5.5.0</kryo.version>
   </properties>

   <dependencies>
     <dependency>
       <groupId>com.esotericsoftware</groupId>
       <artifactId>kryo</artifactId>
-      <version>4.0.2</version>
+      <version>${{kryo.version}}</version>
     </dependency>
     <dependency>
       <groupId>org.scala-lang</groupId>

@@ -48,29 +51,12 @@ pom_template = """
       <artifactId>commons-logging</artifactId>
       <version>1.2</version>
     </dependency>
-    <dependency>
-      <groupId>com.typesafe.akka</groupId>
-      <artifactId>akka-testkit_${{scala.binary.version}}</artifactId>
-      <version>2.6.20</version>
-      <scope>test</scope>
-    </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${{scala.binary.version}}</artifactId>
-      <version>3.0.8</version>
+      <version>${{scalatest.version}}</version>
       <scope>test</scope>
     </dependency>
-    <dependency>
-      <groupId>org.scalactic</groupId>
-      <artifactId>scalactic_${{scala.binary.version}}</artifactId>
-      <version>3.2.15</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-lang3</artifactId>
-      <version>3.9</version>
-    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_${{scala.binary.version}}</artifactId>

@@ -92,7 +78,7 @@ pom_template = """
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>4.13.2</version>
+      <version>${{junit.version}}</version>
       <scope>test</scope>
     </dependency>
     <dependency>

@@ -122,36 +108,9 @@ pom_template = """

   <build>
     <plugins>
-      <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
-      <plugin>
-        <artifactId>maven-clean-plugin</artifactId>
-        <version>3.1.0</version>
-      </plugin>
-      <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
-      <plugin>
-        <artifactId>maven-resources-plugin</artifactId>
-        <version>3.0.2</version>
-      </plugin>
-      <plugin>
-        <artifactId>maven-compiler-plugin</artifactId>
-        <version>3.8.0</version>
-      </plugin>
-      <plugin>
-        <artifactId>maven-jar-plugin</artifactId>
-        <version>3.0.2</version>
-      </plugin>
-      <plugin>
-        <artifactId>maven-install-plugin</artifactId>
-        <version>2.5.2</version>
-      </plugin>
-      <plugin>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <version>2.8.2</version>
-      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-assembly-plugin</artifactId>
         <version>2.4</version>
         <configuration>
           <descriptorRefs>
             <descriptorRef>jar-with-dependencies</descriptorRef>

@@ -171,22 +130,12 @@ pom_template = """
         </execution>
       </executions>
     </plugin>
-    <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
-    <plugin>
-      <artifactId>maven-site-plugin</artifactId>
-      <version>3.7.1</version>
-    </plugin>
-    <plugin>
-      <artifactId>maven-project-info-reports-plugin</artifactId>
-      <version>3.0.0</version>
-    </plugin>
     <plugin>
       <groupId>org.apache.maven.plugins</groupId>
       <artifactId>maven-surefire-plugin</artifactId>
       <version>2.22.1</version>
       <configuration>
         <dependenciesToScan>
-          <dependency>ml.dmlc:xgboost4j_2.12</dependency>
+          <dependency>ml.dmlc:xgboost4j_${{scala.binary.version}}</dependency>
         </dependenciesToScan>
       </configuration>
     </plugin>
@@ -1,20 +0,0 @@
package ml.dmlc.xgboost4j.tester;

import static org.junit.Assert.assertTrue;

import org.junit.Test;

/**
 * Unit test for simple App.
 */
public class AppTest
{
    /**
     * Rigorous Test :-)
     */
    @Test
    public void shouldAnswerWithTrue()
    {
        assertTrue( true );
    }
}
@@ -5,10 +5,11 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm_2.12</artifactId>
+    <artifactId>xgboost-jvm</artifactId>
     <version>2.0.0-SNAPSHOT</version>
   </parent>
-  <artifactId>xgboost4j_2.12</artifactId>
   <name>xgboost4j</name>
+  <artifactId>xgboost4j_${scala.binary.version}</artifactId>
   <version>2.0.0-SNAPSHOT</version>
   <packaging>jar</packaging>

@@ -28,13 +29,13 @@
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>4.13.2</version>
+      <version>${junit.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>
-      <version>3.2.16</version>
+      <version>${scalatest.version}</version>
       <scope>provided</scope>
     </dependency>
   </dependencies>
@@ -37,7 +37,7 @@ trait EvalTrait extends IEvaluation {
    */
  def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float

-  private[scala] def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = {
+  def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = {
    require(predicts.length == jdmat.getLabel.length, "predicts size and label size must match " +
      s" predicts size: ${predicts.length}, label size: ${jdmat.getLabel.length}")
    eval(predicts, new DMatrix(jdmat))

@@ -31,7 +31,7 @@ trait ObjectiveTrait extends IObjective {
    */
  def getGradient(predicts: Array[Array[Float]], dtrain: DMatrix): List[Array[Float]]

-  private[scala] def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix):
+  def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix):
      java.util.List[Array[Float]] = {
    getGradient(predicts, new DMatrix(dtrain)).asJava
  }
@@ -17,12 +17,11 @@
 package ml.dmlc.xgboost4j.scala

 import java.io.InputStream
-import ml.dmlc.xgboost4j.java.{XGBoostError, XGBoost => JXGBoost}
+import ml.dmlc.xgboost4j.java.{XGBoostError, Booster => JBooster, XGBoost => JXGBoost}
-import scala.collection.JavaConverters._
+import scala.jdk.CollectionConverters._
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.Path

 /**
  * XGBoost Scala Training function.

@@ -40,7 +39,12 @@ object XGBoost {
       earlyStoppingRound: Int = 0,
       prevBooster: Booster,
       checkpointParams: Option[ExternalCheckpointParams]): Booster = {
-    val jWatches = watches.mapValues(_.jDMatrix).asJava
+
+    // we have to filter null value for customized obj and eval
+    val jParams: java.util.Map[String, AnyRef] =
+      params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).toMap.asJava
+
+    val jWatches = watches.mapValues(_.jDMatrix).toMap.asJava
     val jBooster = if (prevBooster == null) {
       null
     } else {

@@ -51,8 +55,7 @@ object XGBoost {
       map(cp => {
         JXGBoost.trainAndSaveCheckpoint(
           dtrain.jDMatrix,
-          // we have to filter null value for customized obj and eval
-          params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava,
+          jParams,
           numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster,
           cp.checkpointInterval,
           cp.checkpointPath,

@@ -61,8 +64,7 @@ object XGBoost {
       getOrElse(
         JXGBoost.train(
           dtrain.jDMatrix,
-          // we have to filter null value for customized obj and eval
-          params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava,
+          jParams,
           numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster)
       )
     if (prevBooster == null) {
@@ -82,9 +82,10 @@ def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array
     if isinstance(data, str):
         return bytes(data, "utf-8")
     if isinstance(data, list):
-        pointers: ctypes.Array[ctypes.c_char_p] = (ctypes.c_char_p * len(data))()
-        data_as_bytes = [bytes(d, "utf-8") for d in data]
-        pointers[:] = data_as_bytes  # type: ignore
+        data_as_bytes: List[bytes] = [bytes(d, "utf-8") for d in data]
+        pointers: ctypes.Array[ctypes.c_char_p] = (
+            ctypes.c_char_p * len(data_as_bytes)
+        )(*data_as_bytes)
         return pointers
     raise TypeError()
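A quick round-trip sketch for the rewritten helper (assuming the private import path from `xgboost.core`; the sample names are illustrative): for a list input the result is a `ctypes` array of `c_char_p` whose elements decode back to the original strings.

```python
import ctypes
from xgboost.core import from_pystr_to_cstr  # private helper, path assumed

names = ["feature_0", "feature_1"]
carr = from_pystr_to_cstr(names)  # ctypes.Array[ctypes.c_char_p]
assert isinstance(carr, ctypes.Array)
assert [c.decode("utf-8") for c in carr] == names
```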
@@ -319,7 +320,7 @@ def _cuda_array_interface(data: DataType) -> bytes:
 def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.ndarray:
     """Convert a ctypes pointer array to a numpy array."""
     ctype: Type[CNumeric] = _numpy2ctypes_type(dtype)
-    if not isinstance(cptr, ctypes.POINTER(ctype)):  # type: ignore
+    if not isinstance(cptr, ctypes.POINTER(ctype)):
         raise RuntimeError(f"expected {ctype} pointer")
     res = np.zeros(length, dtype=dtype)
     if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):

@@ -2460,9 +2461,9 @@ class Booster:
             raise TypeError("Unknown file type: ", fname)

         if self.attr("best_iteration") is not None:
-            self.best_iteration = int(self.attr("best_iteration"))  # type: ignore
+            self.best_iteration = int(cast(int, self.attr("best_iteration")))
         if self.attr("best_score") is not None:
-            self.best_score = float(self.attr("best_score"))  # type: ignore
+            self.best_score = float(cast(float, self.attr("best_score")))

     def num_boosted_rounds(self) -> int:
         """Get number of boosted rounds. For gblinear this is reset to 0 after
@@ -882,7 +882,7 @@ def _transform_cupy_array(data: DataType) -> CupyT:

     if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"):
         data = cupy.array(data, copy=False)
-    if data.dtype.hasobject or data.dtype in [cupy.float16, cupy.bool_]:
+    if data.dtype.hasobject or data.dtype in [cupy.bool_]:
         data = data.astype(cupy.float32, copy=False)
     return data
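One consequence of dropping `float16` from the conversion list: half-precision cupy inputs now flow through unchanged, while booleans are still upcast. A hedged check (requires a CUDA device; `_transform_cupy_array` is the private helper from this diff and the import path is an assumption):

```python
import cupy as cp
from xgboost.data import _transform_cupy_array

x = cp.ones((4, 2), dtype=cp.float16)
assert _transform_cupy_array(x).dtype == cp.float16  # no longer upcast
assert _transform_cupy_array(x.astype(cp.bool_)).dtype == cp.float32
```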
@@ -337,10 +337,8 @@ class _SparkXGBParams(

         if self.getOrDefault(self.features_cols):
             if not self.getOrDefault(self.use_gpu):
-                raise ValueError("features_cols param requires enabling use_gpu.")
-
-            get_logger(self.__class__.__name__).warning(
-                "If features_cols param set, then features_col param is ignored."
+                raise ValueError(
+                    "features_col param with list value requires enabling use_gpu."
                 )

         if self.getOrDefault("objective") is not None:
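The tightened check above turns what used to be a warning into a hard error. A hypothetical construction that satisfies it (column names are illustrative): passing a list to `features_col` implicitly sets `features_cols`, which requires `use_gpu=True`.

```python
from xgboost.spark import SparkXGBClassifier

clf = SparkXGBClassifier(
    features_col=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    label_col="class_index",
    use_gpu=True,  # mandatory when features_col is a list of columns
    num_workers=2,
)
```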
@@ -547,6 +545,8 @@ FeatureProp = namedtuple(


 class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
+    _input_kwargs: Dict[str, Any]
+
     def __init__(self) -> None:
         super().__init__()
         self._set_xgb_params_default()

@@ -576,6 +576,11 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
             raise ValueError("Invalid param name: 'arbitrary_params_dict'.")

         for k, v in kwargs.items():
+            # We're not allowing user use features_cols directly.
+            if k == self.features_cols.name:
+                raise ValueError(
+                    f"Unsupported param '{k}' please use features_col instead."
+                )
             if k in _inverse_pyspark_param_alias_map:
                 raise ValueError(
                     f"Please use param name {_inverse_pyspark_param_alias_map[k]} instead."

@@ -591,6 +596,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                 k = real_k

             if self.hasParam(k):
+                if k == "features_col" and isinstance(v, list):
+                    self._set(**{"features_cols": v})
+                else:
                     self._set(**{str(k): v})
             else:
                 if (
@@ -1,10 +1,13 @@
 """Xgboost pyspark integration submodule for estimator API."""
-# pylint: disable=too-many-ancestors
+# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
+# pylint: disable=unused-argument, too-many-locals

-from typing import Any, Type
+from typing import Any, Dict, List, Optional, Type, Union

 import numpy as np
 from pyspark import keyword_only
 from pyspark.ml.param import Param, Params
 from pyspark.ml.param.shared import HasProbabilityCol, HasRawPredictionCol
@@ -83,8 +86,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     :py:class:`~pyspark.ml.classification.OneVsRest`

     SparkXGBRegressor automatically supports most of the parameters in
-    `xgboost.XGBRegressor` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBRegressor` fit and predict method.
+    :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
+    :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.

     SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.

@@ -97,13 +100,23 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
     param for each xgboost worker will be set equal to `spark.task.cpus` config value.

-    callbacks:
-        The export and import of the callback functions are at best effort.
-        For details, see :py:attr:`xgboost.spark.SparkXGBRegressor.callbacks` param doc.
-    validation_indicator_col
-        For params related to `xgboost.XGBRegressor` training
-        with evaluation dataset's supervision, set
-        :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col`

     Parameters
     ----------

+    features_col:
+        When the value is string, it requires the features column name to be vector type.
+        When the value is a list of string, it requires all the feature columns to be numeric types.
+    label_col:
+        Label column name. Default to "label".
+    prediction_col:
+        Prediction column name. Default to "prediction"
+    pred_contrib_col:
+        Contribution prediction column name.
+    validation_indicator_col:
+        For params related to `xgboost.XGBRegressor` training with
+        evaluation dataset's supervision,
+        set :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRegressor`
+        fit method.
     weight_col:

@@ -111,26 +124,40 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         :py:attr:`xgboost.spark.SparkXGBRegressor.weight_col` parameter instead of setting
         `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRegressor`
         fit method.
     xgb_model:
         Set the value to be the instance returned by
         :func:`xgboost.spark.SparkXGBRegressorModel.get_booster`.
-    num_workers:
-        Integer that specifies the number of XGBoost workers to use.
-        Each XGBoost worker corresponds to one spark task.
-    use_gpu:
-        Boolean that specifies whether the executors are running on GPU
-        instances.
     base_margin_col:
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed
-        training.
+        `xgboost.XGBRegressor` fit method.

-    .. Note:: The Parameters chart above contains parameters that need special handling.
+    num_workers:
+        How many XGBoost workers to be used to train.
+        Each XGBoost worker corresponds to one spark task.
+    use_gpu:
+        Boolean value to specify whether the executors are running on GPU
+        instances.
+    force_repartition:
+        Boolean value to specify if forcing the input dataset to be repartitioned
+        before XGBoost training.
+    repartition_random_shuffle:
+        Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
+    enable_sparse_data_optim:
+        Boolean value to specify if enabling sparse data optimization, if True,
+        Xgboost DMatrix object will be constructed from sparse matrix instead of
+        dense matrix.

     kwargs:
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html

+    Note
+    ----
+
+    The Parameters chart above contains parameters that need special handling.
+    For a full list of parameters, see entries with `Param(parent=...` below.

-    .. Note:: This API is experimental.
+    This API is experimental.


     Examples
     --------

@@ -155,9 +182,27 @@ class SparkXGBRegressor(_SparkXGBEstimator):

     """

-    def __init__(self, **kwargs: Any) -> None:
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        features_col: Union[str, List[str]] = "features",
+        label_col: str = "label",
+        prediction_col: str = "prediction",
+        pred_contrib_col: Optional[str] = None,
+        validation_indicator_col: Optional[str] = None,
+        weight_col: Optional[str] = None,
+        base_margin_col: Optional[str] = None,
+        num_workers: int = 1,
+        use_gpu: bool = False,
+        force_repartition: bool = False,
+        repartition_random_shuffle: bool = False,
+        enable_sparse_data_optim: bool = False,
+        **kwargs: Dict[str, Any],
+    ) -> None:
         super().__init__()
-        self.setParams(**kwargs)
+        input_kwargs = self._input_kwargs
+        self.setParams(**input_kwargs)

     @classmethod
     def _xgb_cls(cls) -> Type[XGBRegressor]:
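With `@keyword_only`, PySpark records the explicitly passed arguments in `self._input_kwargs`, so only user-supplied values reach `setParams`. A hedged usage sketch (values illustrative):

```python
from xgboost.spark import SparkXGBRegressor

reg = SparkXGBRegressor(
    features_col="features",
    label_col="label",
    validation_indicator_col="is_val",
    num_workers=2,
    max_depth=5,  # any extra xgboost parameter travels through **kwargs
)
```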
@@ -199,8 +244,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     :py:class:`~pyspark.ml.classification.OneVsRest`

     SparkXGBClassifier automatically supports most of the parameters in
-    `xgboost.XGBClassifier` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBClassifier` fit and predict method.
+    :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
+    :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.

     SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.

@@ -220,13 +265,21 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     Parameters
     ----------

-    callbacks:
-        The export and import of the callback functions are at best effort. For
-        details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc.
+    features_col:
+        When the value is string, it requires the features column name to be vector type.
+        When the value is a list of string, it requires all the feature columns to be numeric types.
+    label_col:
+        Label column name. Default to "label".
+    prediction_col:
+        Prediction column name. Default to "prediction"
     probability_col:
         Column name for predicted class conditional probabilities. Default to probabilityCol
     raw_prediction_col:
         The `output_margin=True` is implicitly supported by the
         `rawPredictionCol` output column, which is always returned with the predicted margin
         values.
+    pred_contrib_col:
+        Contribution prediction column name.
     validation_indicator_col:
         For params related to `xgboost.XGBClassifier` training with
         evaluation dataset's supervision,

@@ -238,26 +291,39 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
         `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
         fit method.
     xgb_model:
         Set the value to be the instance returned by
         :func:`xgboost.spark.SparkXGBClassifierModel.get_booster`.
-    num_workers:
-        Integer that specifies the number of XGBoost workers to use.
-        Each XGBoost worker corresponds to one spark task.
-    use_gpu:
-        Boolean that specifies whether the executors are running on GPU
-        instances.
     base_margin_col:
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
-        training.
+        `xgboost.XGBClassifier` fit method.

-    .. Note:: The Parameters chart above contains parameters that need special handling.
+    num_workers:
+        How many XGBoost workers to be used to train.
+        Each XGBoost worker corresponds to one spark task.
+    use_gpu:
+        Boolean value to specify whether the executors are running on GPU
+        instances.
+    force_repartition:
+        Boolean value to specify if forcing the input dataset to be repartitioned
+        before XGBoost training.
+    repartition_random_shuffle:
+        Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
+    enable_sparse_data_optim:
+        Boolean value to specify if enabling sparse data optimization, if True,
+        Xgboost DMatrix object will be constructed from sparse matrix instead of
+        dense matrix.

     kwargs:
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html

+    Note
+    ----
+
+    The Parameters chart above contains parameters that need special handling.
+    For a full list of parameters, see entries with `Param(parent=...` below.

-    .. Note:: This API is experimental.
+    This API is experimental.

     Examples
     --------

@@ -281,14 +347,34 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction

     """

-    def __init__(self, **kwargs: Any) -> None:
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        features_col: Union[str, List[str]] = "features",
+        label_col: str = "label",
+        prediction_col: str = "prediction",
+        probability_col: str = "probability",
+        raw_prediction_col: str = "rawPrediction",
+        pred_contrib_col: Optional[str] = None,
+        validation_indicator_col: Optional[str] = None,
+        weight_col: Optional[str] = None,
+        base_margin_col: Optional[str] = None,
+        num_workers: int = 1,
+        use_gpu: bool = False,
+        force_repartition: bool = False,
+        repartition_random_shuffle: bool = False,
+        enable_sparse_data_optim: bool = False,
+        **kwargs: Dict[str, Any],
+    ) -> None:
         super().__init__()
         # The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
         # but in pyspark we will automatically set objective param depending on
         # binary or multinomial input dataset, and we need to remove the fixed default
         # param value as well to avoid causing ambiguity.
+        input_kwargs = self._input_kwargs
+        self.setParams(**input_kwargs)
         self._setDefault(objective=None)
-        self.setParams(**kwargs)

     @classmethod
     def _xgb_cls(cls) -> Type[XGBClassifier]:
@@ -334,8 +420,8 @@ class SparkXGBRanker(_SparkXGBEstimator):
     :py:class:`~pyspark.ml.classification.OneVsRest`

     SparkXGBRanker automatically supports most of the parameters in
-    `xgboost.XGBRanker` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBRanker` fit and predict method.
+    :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
+    :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.

     SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.

@@ -355,39 +441,53 @@ class SparkXGBRanker(_SparkXGBEstimator):
     Parameters
     ----------

-    callbacks:
-        The export and import of the callback functions are at best effort. For
-        details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc.
+    features_col:
+        When the value is string, it requires the features column name to be vector type.
+        When the value is a list of string, it requires all the feature columns to be numeric types.
+    label_col:
+        Label column name. Default to "label".
+    prediction_col:
+        Prediction column name. Default to "prediction"
+    pred_contrib_col:
+        Contribution prediction column name.
     validation_indicator_col:
         For params related to `xgboost.XGBRanker` training with
         evaluation dataset's supervision,
-        set :py:attr:`xgboost.spark.XGBRanker.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker`
+        set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBRanker`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set
         :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker`
+        `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBRanker`
         fit method.
     xgb_model:
         Set the value to be the instance returned by
         :func:`xgboost.spark.SparkXGBRankerModel.get_booster`.
-    num_workers:
-        Integer that specifies the number of XGBoost workers to use.
-        Each XGBoost worker corresponds to one spark task.
-    use_gpu:
-        Boolean that specifies whether the executors are running on GPU
-        instances.
     base_margin_col:
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBRanker` fit method.
+        :py:class:`xgboost.XGBRanker` fit method.
     qid_col:
-        To specify the qid of the training and validation
-        dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.qid_col` parameter
-        instead of setting `qid` / `group`, `eval_qid` / `eval_group` in the
-        `xgboost.XGBRanker` fit method.
+        Query id column name.

+    num_workers:
+        How many XGBoost workers to be used to train.
+        Each XGBoost worker corresponds to one spark task.
+    use_gpu:
+        Boolean value to specify whether the executors are running on GPU
+        instances.
+    force_repartition:
+        Boolean value to specify if forcing the input dataset to be repartitioned
+        before XGBoost training.
+    repartition_random_shuffle:
+        Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
+    enable_sparse_data_optim:
+        Boolean value to specify if enabling sparse data optimization, if True,
+        Xgboost DMatrix object will be constructed from sparse matrix instead of
+        dense matrix.

     kwargs:
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html

     .. Note:: The Parameters chart above contains parameters that need special handling.
        For a full list of parameters, see entries with `Param(parent=...` below.
@@ -426,9 +526,28 @@ class SparkXGBRanker(_SparkXGBEstimator):
     >>> model.transform(df_test).show()
     """

-    def __init__(self, **kwargs: Any) -> None:
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        features_col: Union[str, List[str]] = "features",
+        label_col: str = "label",
+        prediction_col: str = "prediction",
+        pred_contrib_col: Optional[str] = None,
+        validation_indicator_col: Optional[str] = None,
+        weight_col: Optional[str] = None,
+        base_margin_col: Optional[str] = None,
+        qid_col: Optional[str] = None,
+        num_workers: int = 1,
+        use_gpu: bool = False,
+        force_repartition: bool = False,
+        repartition_random_shuffle: bool = False,
+        enable_sparse_data_optim: bool = False,
+        **kwargs: Dict[str, Any],
+    ) -> None:
         super().__init__()
-        self.setParams(**kwargs)
+        input_kwargs = self._input_kwargs
+        self.setParams(**input_kwargs)

     @classmethod
     def _xgb_cls(cls) -> Type[XGBRanker]:
@ -3,6 +3,7 @@
|
||||
*/
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "communicator.h"
|
||||
|
||||
@ -224,5 +225,46 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
}

template <typename T>
struct AllgatherVResult {
std::vector<std::size_t> offsets;
std::vector<std::size_t> sizes;
std::vector<T> result;
};

/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
*
* We assume each worker has the same number of inputs, but each input may be of a different size.
*
* @param inputs All the inputs from the local worker.
* @param sizes Sizes of each input.
*/
template <typename T>
inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
std::vector<std::size_t> const &sizes) {
auto num_inputs = sizes.size();

// Gather the sizes across all workers.
std::vector<std::size_t> all_sizes(num_inputs * GetWorldSize());
std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank());
collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t));

// Calculate input offsets (std::exclusive_scan).
std::vector<std::size_t> offsets(all_sizes.size());
for (std::size_t i = 1; i < offsets.size(); i++) {
offsets[i] = offsets[i - 1] + all_sizes[i - 1];
}

// Gather all the inputs.
auto total_input_size = offsets.back() + all_sizes.back();
std::vector<T> all_inputs(total_input_size);
std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]);
// We cannot use allgather here, since each worker might have a different size.
Allreduce<Operation::kMax>(all_inputs.data(), all_inputs.size());

return {offsets, all_sizes, all_inputs};
}

} // namespace collective
} // namespace xgboost
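
For reference, the offsets returned above are just an exclusive scan over the gathered sizes, so input i of worker w starts at offsets[w * num_inputs + i]. A minimal, self-contained sketch of that bookkeeping in plain C++ (the sizes are hypothetical, and no collective calls are made):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical gathered sizes: 2 workers x 2 inputs each.
  std::vector<std::size_t> all_sizes{3, 1, 4, 2};

  // Exclusive scan, exactly as in AllgatherV above.
  std::vector<std::size_t> offsets(all_sizes.size());
  for (std::size_t i = 1; i < offsets.size(); i++) {
    offsets[i] = offsets[i - 1] + all_sizes[i - 1];
  }

  // The flat gather buffer is the last offset plus the last size long.
  std::size_t total = offsets.back() + all_sizes.back();
  for (auto o : offsets) std::cout << o << ' ';  // prints: 0 3 4 8
  std::cout << "(total " << total << ")\n";      // total 10
}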

@ -12,19 +12,22 @@
namespace xgboost {
namespace collective {

thread_local int Communicator::device_ordinal_{-1};
thread_local std::unique_ptr<DeviceCommunicator> Communicator::device_communicator_{};

void Communicator::Finalize() {
communicator_->Shutdown();
communicator_.reset(new NoOpCommunicator());
device_ordinal_ = -1;
device_communicator_.reset(nullptr);
}

DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
if (!device_communicator_ || device_ordinal_ != device_ordinal) {
device_ordinal_ = device_ordinal;
thread_local auto old_device_ordinal = -1;
// If the number of GPUs changes, we need to re-initialize NCCL.
thread_local auto old_world_size = -1;
if (!device_communicator_ || device_ordinal != old_device_ordinal ||
communicator_->GetWorldSize() != old_world_size) {
old_device_ordinal = device_ordinal;
old_world_size = communicator_->GetWorldSize();
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
if (type_ != CommunicatorType::kFederated) {
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
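
The new guard keys the cached device communicator on both the device ordinal and the world size. A single-process sketch of the same memoize-and-rebuild pattern, using an illustrative FakeComm stand-in rather than the real NCCL communicator:

#include <iostream>
#include <memory>

// Illustrative stand-in for the cached device communicator.
struct FakeComm {
  int device;
  int world;
};

FakeComm* GetComm(int device, int world) {
  thread_local std::unique_ptr<FakeComm> cached;
  thread_local int old_device = -1;
  thread_local int old_world = -1;
  if (!cached || device != old_device || world != old_world) {
    old_device = device;
    old_world = world;
    cached.reset(new FakeComm{device, world});  // rebuild on any change
    std::cout << "re-initialized: device " << device << ", world " << world << "\n";
  }
  return cached.get();
}

int main() {
  GetComm(0, 2);  // first call: initializes
  GetComm(0, 2);  // cache hit: nothing printed
  GetComm(0, 4);  // world size changed: re-initializes
}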

@ -229,7 +229,6 @@ class Communicator {
static thread_local std::unique_ptr<Communicator> communicator_;
static thread_local CommunicatorType type_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
static thread_local int device_ordinal_;
static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
#endif


@ -1,5 +1,5 @@
/**
* Copyright 2022 by XGBoost Contributors
* Copyright 2022-2023, XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
#define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
@ -16,21 +16,39 @@ struct CUDAContext {
/**
* \brief Caching thrust policy.
*/
#if defined(XGBOOST_USE_HIP)
auto CTP() const { return thrust::hip::par(caching_alloc_).on(dh::DefaultStream()); }
auto CTP() const {
#if defined(XGBOOST_USE_CUDA)
#if THRUST_MAJOR_VERSION >= 2
return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
#else
auto CTP() const { return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); }
return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
#endif  // THRUST_MAJOR_VERSION >= 2
#elif defined(XGBOOST_USE_HIP)
#if THRUST_MAJOR_VERSION >= 2
return thrust::hip::par_nosync(caching_alloc_).on(dh::DefaultStream());
#else
return thrust::hip::par(caching_alloc_).on(dh::DefaultStream());
#endif  // THRUST_MAJOR_VERSION >= 2
#endif

}
/**
* \brief Thrust policy without caching allocator.
*/
#if defined(XGBOOST_USE_HIP)
auto TP() const { return thrust::hip::par(alloc_).on(dh::DefaultStream()); }
auto TP() const {
#if defined(XGBOOST_USE_CUDA)
#if THRUST_MAJOR_VERSION >= 2
return thrust::cuda::par_nosync(alloc_).on(dh::DefaultStream());
#else
auto TP() const { return thrust::cuda::par(alloc_).on(dh::DefaultStream()); }
return thrust::cuda::par(alloc_).on(dh::DefaultStream());
#endif  // THRUST_MAJOR_VERSION >= 2
#elif defined(XGBOOST_USE_HIP)
#if THRUST_MAJOR_VERSION >= 2
return thrust::hip::par_nosync(alloc_).on(dh::DefaultStream());
#else
return thrust::hip::par(alloc_).on(dh::DefaultStream());
#endif  // THRUST_MAJOR_VERSION >= 2
#endif

}
auto Stream() const { return dh::DefaultStream(); }
};
} // namespace xgboost
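
The gist of the change: with Thrust 2.x the non-synchronizing par_nosync policy is preferred, so algorithm launches do not eagerly synchronize the stream. A stripped-down, host-compilable sketch of the same version gate (it assumes only that the Thrust headers are on the include path; thrust/version.h is macros only):

#include <thrust/version.h>
#include <iostream>

int main() {
  // Report which policy the gate above would choose for this Thrust.
#if THRUST_MAJOR_VERSION >= 2
  std::cout << "Thrust " << THRUST_MAJOR_VERSION << ": par_nosync available\n";
#else
  std::cout << "Thrust " << THRUST_MAJOR_VERSION << ": falling back to par\n";
#endif
}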

@ -227,9 +227,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
batch_it, dummy_is_valid,
0, sorted_entries.size(),
&cuts_ptr, &column_sizes_scan);
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();

if (sketch_container->HasCategorical()) {
@ -296,9 +295,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
});
detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
batch_it, dummy_is_valid,
0, sorted_entries.size(),
&cuts_ptr, &column_sizes_scan);
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) {
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,

@ -17,6 +17,10 @@
#include "quantile.cuh"
#include "timer.h"

#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif

namespace xgboost {
namespace common {
namespace cuda {
@ -53,24 +57,128 @@ struct EntryCompareOp {
};

// Get column size from adapter batch and for output cuts.
template <typename Iter>
void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feature,
Iter batch_iter, data::IsValidFunctor is_valid,
size_t begin, size_t end,
template <std::uint32_t kBlockThreads, typename CounterT, typename BatchIt>
__global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid,
Span<std::size_t> out_column_size) {
extern __shared__ char smem[];

auto smem_cs_ptr = reinterpret_cast<CounterT*>(smem);

dh::BlockFill(smem_cs_ptr, out_column_size.size(), 0);

cub::CTA_SYNC();

auto n = batch_iter.size();

for (auto idx : dh::GridStrideRange(static_cast<std::size_t>(0), n)) {
auto e = batch_iter[idx];
if (is_valid(e)) {
atomicAdd(&smem_cs_ptr[e.column_idx], static_cast<CounterT>(1));
}
}

cub::CTA_SYNC();

auto out_global_ptr = out_column_size;
for (auto i : dh::BlockStrideRange(static_cast<std::size_t>(0), out_column_size.size())) {
atomicAdd(&out_global_ptr[i], static_cast<std::size_t>(smem_cs_ptr[i]));
}
}

template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
#endif
int n_blocks_per_mp = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem));
#endif
std::uint32_t grid_size = n_blocks_per_mp * n_mps;
return grid_size;
}

/**
* \brief Get the size of each column. This is a histogram with additional handling of
* invalid values.
*
* \tparam BatchIt Type of input adapter batch.
* \tparam force_use_global_memory Used for testing. Force global atomic add.
* \tparam force_use_u64 Used for testing. For u64 as counter in shared memory.
*
* \param device CUDA device ordinal.
* \param batch_iter Iterator for input data from adapter batch.
* \param is_valid Whether an element is considered as missing.
* \param out_column_size Output buffer for the size of each column.
*/
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);

std::size_t max_shared_memory = dh::MaxSharedMemory(device);
// Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of
// elements.
std::size_t n = batch_iter.size();

std::size_t required_shared_memory = 0;
bool use_u32{false};
if (!force_use_u64 && n < static_cast<std::size_t>(std::numeric_limits<std::uint32_t>::max())) {
required_shared_memory = out_column_size.size() * sizeof(std::uint32_t);
use_u32 = true;
} else {
required_shared_memory = out_column_size.size() * sizeof(std::size_t);
use_u32 = false;
}
bool use_shared = required_shared_memory <= max_shared_memory && required_shared_memory != 0;

if (!force_use_global_memory && use_shared) {
CHECK_NE(required_shared_memory, 0);
std::uint32_t constexpr kBlockThreads = 512;
if (use_u32) {
CHECK(!force_use_u64);
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
kernel, batch_iter, is_valid, out_column_size);
} else {
auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
kernel, batch_iter, is_valid, out_column_size);
}
} else {
auto d_out_column_size = out_column_size;
dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) {
auto e = batch_iter[idx];
if (is_valid(e)) {
atomicAdd(&d_out_column_size[e.column_idx], static_cast<size_t>(1));
}
});
}
}
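
The launcher picks the counter width and the memory space before launching: 32-bit shared-memory counters when the element count fits, 64-bit otherwise, and global-memory atomics when one counter per column does not fit in shared memory. A host-only sketch of that decision, with hypothetical sizes and an assumed 48 KiB shared-memory budget:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  std::size_t n_elements = 1000000;    // hypothetical batch size
  std::size_t n_columns = 256;         // hypothetical feature count
  std::size_t max_shared = 48 * 1024;  // assumed 48 KiB shared-memory budget

  // 32-bit counters suffice whenever the element count fits in uint32.
  bool use_u32 =
      n_elements < static_cast<std::size_t>(std::numeric_limits<std::uint32_t>::max());
  std::size_t required =
      n_columns * (use_u32 ? sizeof(std::uint32_t) : sizeof(std::size_t));
  bool use_shared = required != 0 && required <= max_shared;

  std::cout << (use_shared ? "shared" : "global") << " memory, "
            << (use_u32 ? 32 : 64) << "-bit counters, " << required << " bytes\n";
}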

template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) {
column_sizes_scan->resize(num_columns + 1, 0);
column_sizes_scan->resize(num_columns + 1);
cuts_ptr->SetDevice(device);
cuts_ptr->Resize(num_columns + 1, 0);

dh::XGBCachingDeviceAllocator<char> alloc;
auto d_column_sizes_scan = column_sizes_scan->data().get();
dh::LaunchN(end - begin, [=] __device__(size_t idx) {
auto e = batch_iter[begin + idx];
if (is_valid(e)) {
atomicAdd(&d_column_sizes_scan[e.column_idx], static_cast<size_t>(1));
}
});
auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan);
LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan);
// Calculate cuts CSC pointer
auto cut_ptr_it = dh::MakeTransformIterator<size_t>(
column_sizes_scan->begin(), [=] __device__(size_t column_size) {
@ -85,8 +193,7 @@ void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feat
column_sizes_scan->end(), column_sizes_scan->begin());
#elif defined(XGBOOST_USE_CUDA)
thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
cut_ptr_it + column_sizes_scan->size(),
cuts_ptr->DevicePointer());
cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
column_sizes_scan->end(), column_sizes_scan->begin());
#endif
@ -130,29 +237,26 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,

// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
Range1d range, float missing,
size_t columns, size_t cuts_per_feature, int device,
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) {
auto entry_iter = dh::MakeTransformIterator<Entry>(
thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) {
return Entry(batch.GetElement(idx).column_idx,
batch.GetElement(idx).value);
return Entry(batch.GetElement(idx).column_idx, batch.GetElement(idx).value);
});
auto n = range.end() - range.begin();
auto span = IterSpan{batch_iter + range.begin(), n};
data::IsValidFunctor is_valid(missing);
// Work out how many valid entries we have in each column
GetColumnSizesScan(device, columns, cuts_per_feature,
batch_iter, is_valid,
range.begin(), range.end(),
cut_sizes_scan,
GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
column_sizes_scan);
size_t num_valid = column_sizes_scan->back();
// Copy current subset of valid elements into temporary storage and sort
sorted_entries->resize(num_valid);
dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(),
sorted_entries->begin(), is_valid);
dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
is_valid);
}

void SortByWeight(dh::device_vector<float>* weights,

@ -209,7 +209,7 @@ class PartitionBuilder {
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bst_feature_t fid = tree.SplitIndex(nid);
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
@ -263,14 +263,13 @@ class PartitionBuilder {
template <typename ExpandEntry>
void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits,
const RegTree& tree, const size_t* rid, BitVector const& decision_bits,
BitVector const& missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();
bool default_left = tree.DefaultLeft(nid);

auto pred = [&](auto ridx) {
bool go_left = default_left;

@ -7,7 +7,6 @@
#include <utility>

#include "../collective/aggregator.h"
#include "../collective/communicator-inl.h"
#include "../data/adapter.h"
#include "categorical.h"
#include "hist_util.h"
@ -143,6 +142,7 @@ struct QuantileAllreduce {

template <typename WQSketch>
void SketchContainerImpl<WQSketch>::GatherSketchInfo(
MetaInfo const& info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches) {
@ -168,7 +168,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);

// Gather all column pointers
collective::Allreduce<collective::Operation::kSum>(sketches_scan.data(), sketches_scan.size());
collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size());
for (int32_t i = 0; i < world; ++i) {
size_t back = (i + 1) * (n_columns + 1) - 1;
auto n_entries = sketches_scan.at(back);
@ -196,7 +196,8 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(

static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
"Unexpected size of sketch entry.");
collective::Allreduce<collective::Operation::kSum>(
collective::GlobalSum(
info,
reinterpret_cast<float *>(global_sketches.data()),
global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
}
@ -222,8 +223,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker
std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
collective::Allreduce<collective::Operation::kSum>(global_feat_ptrs.data(),
global_feat_ptrs.size());
collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size());

// move all categories into a flatten vector to prepare for allreduce
size_t total = feature_ptr.back();
@ -236,8 +236,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
// indptr for indexing workers
std::vector<size_t> global_worker_ptr(world_size + 1, 0);
global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr
collective::Allreduce<collective::Operation::kSum>(global_worker_ptr.data(),
global_worker_ptr.size());
collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size());
std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
// total number of categories in all workers with all features
auto gtotal = global_worker_ptr.back();
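
The indptr trick here is worth spelling out: each worker writes only its own total at slot rank + 1, the sum-allreduce fills in everyone's totals, and an in-place prefix sum turns them into per-worker offsets. A self-contained sketch with made-up totals for three workers:

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical state right after the (simulated) sum-allreduce:
  // slot 0 stays 0, slot r + 1 holds worker r's total.
  std::vector<std::size_t> global_worker_ptr{0, 4, 2, 5};
  std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(),
                   global_worker_ptr.begin());
  for (auto v : global_worker_ptr) std::cout << v << ' ';  // prints: 0 4 6 11
  std::cout << '\n';
}
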
@ -249,8 +248,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
CHECK_EQ(rank_size, total);
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
// gather values from all workers.
collective::Allreduce<collective::Operation::kSum>(global_categories.data(),
global_categories.size());
collective::GlobalSum(info, global_categories.data(), global_categories.size());
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
categories_.size()};
ParallelFor(categories_.size(), n_threads_, [&](auto fidx) {
@ -323,7 +321,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
std::vector<bst_row_t> sketches_scan((n_columns + 1) * world, 0);

std::vector<typename WQSketch::Entry> global_sketches;
this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, &global_sketches);
this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches);

std::vector<typename WQSketch::SummaryContainer> final_sketches(n_columns);

@ -371,7 +369,9 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
InvalidCategory();
}
auto &cut_values = cuts->cut_values_.HostVector();
auto max_cat = *std::max_element(categories.cbegin(), categories.cend());
// With column-wise data split, the categories may be empty.
auto max_cat =
categories.empty() ? 0.0f : *std::max_element(categories.cbegin(), categories.cend());
CheckMaxCat(max_cat, categories.size());
for (bst_cat_t i = 0; i <= AsCat(max_cat); ++i) {
cut_values.push_back(i);

@ -822,7 +822,8 @@ class SketchContainerImpl {
return group_ind;
}
// Gather sketches from all workers.
void GatherSketchInfo(std::vector<typename WQSketch::SummaryContainer> const &reduced,
void GatherSketchInfo(MetaInfo const& info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<bst_row_t> *p_worker_segments,
std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches);

@ -26,6 +26,12 @@
#include "xgboost/logging.h"
#include "xgboost/span.h"

#if defined(XGBOOST_USE_CUDA)
#include "cuda_fp16.h"
#elif defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_fp16.h>
#endif

namespace xgboost {
// Common errors in parsing columnar format.
struct ArrayInterfaceErrors {
@ -304,12 +310,12 @@ class ArrayInterfaceHandler {
template <typename T, typename E = void>
struct ToDType;
// float
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <>
struct ToDType<__half> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
};
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <>
struct ToDType<float> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@ -459,11 +465,11 @@ class ArrayInterface {
CHECK(sizeof(long double) == 16) << error::NoF128();
type = T::kF16;
} else if (typestr[1] == 'f' && typestr[2] == '2') {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
type = T::kF2;
#else
LOG(FATAL) << "Half type is not supported.";
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
} else if (typestr[1] == 'f' && typestr[2] == '4') {
type = T::kF4;
} else if (typestr[1] == 'f' && typestr[2] == '8') {
@ -490,20 +496,17 @@ class ArrayInterface {
}
}

XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; }
XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; }
[[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; }
[[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; }

template <typename Fn>
XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const {
using T = ArrayInterfaceHandler::Type;
switch (type) {
case T::kF2: {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
return func(reinterpret_cast<__half const *>(data));
#else
SPAN_CHECK(false);
return func(reinterpret_cast<float const *>(data));
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
}
case T::kF4:
return func(reinterpret_cast<float const *>(data));
@ -540,23 +543,23 @@ class ArrayInterface {
return func(reinterpret_cast<uint64_t const *>(data));
}

XGBOOST_DEVICE std::size_t ElementSize() const {
[[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
});
}
XGBOOST_DEVICE std::size_t ElementAlignment() const {
[[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
});
}

template <typename T = float, typename... Index>
XGBOOST_DEVICE T operator()(Index &&...index) const {
XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const {
static_assert(sizeof...(index) <= D, "Invalid index.");
return this->DispatchCall([=](auto const *p_values) -> T {
std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
// No operator defined for half -> size_t
using Type = std::conditional_t<
std::is_same<__half,
@ -566,7 +569,7 @@ class ArrayInterface {
return static_cast<T>(static_cast<Type>(p_values[offset]));
#else
return static_cast<T>(p_values[offset]);
#endif
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
});
}
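
The conditional_t dance exists because __half provides no direct conversion to integral types, so the load is routed through float first. A host-only sketch of the same guard, with a stand-in Half type modeling the limitation:

#include <cstddef>
#include <iostream>
#include <type_traits>

// Stand-in for __half: convertible to float, but not to integral types.
struct Half {
  float v;
  explicit operator float() const { return v; }
};

template <typename T, typename Src>
T Convert(Src x) {
  // If the source is Half, hop through float first; otherwise cast directly.
  using Mid = std::conditional_t<std::is_same<Half, Src>::value, float, Src>;
  return static_cast<T>(static_cast<Mid>(x));
}

int main() {
  std::cout << Convert<std::size_t>(Half{3.0f}) << ' '  // prints: 3
            << Convert<std::size_t>(2.5) << '\n';       // prints: 2
}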

@ -603,7 +606,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
};
switch (array.type) {
case ArrayInterfaceHandler::kF2: {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
dispatch(__half{});
#endif
break;

@ -698,6 +698,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
this->feature_type_names = that.feature_type_names;
auto &h_feature_types = feature_types.HostVector();
LoadFeatureType(this->feature_type_names, &h_feature_types);
} else if (!that.feature_types.Empty()) {
this->feature_types.Resize(that.feature_types.Size());
this->feature_types.Copy(that.feature_types);
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());

@ -29,7 +29,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
: columns_(columns),
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ COOTuple GetElement(size_t idx) const {
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];
@ -39,6 +39,14 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value};
}

__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx)
: std::numeric_limits<float>::quiet_NaN();
return value;
}

XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }

@ -166,6 +174,10 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx);
return value;
}

XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
@ -202,40 +214,64 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

// Returns maximum row length
template <typename AdapterBatchT>
size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
int device_idx, float missing) {

#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif

IsValidFunctor is_valid(missing);
// Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
auto element = batch.GetElement(idx);
if (is_valid(element)) {
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&offset[element.row_idx]),
static_cast<unsigned long long>(1)); // NOLINT
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();

// Use more than one thread for each row in case of the dataset being too wide.
bst_feature_t stride{0};
if (n_features < 32) {
stride = std::min(n_features, 4u);
} else if (n_features < 64) {
stride = 8;
} else if (n_features < 128) {
stride = 16;
} else {
stride = 32;
}

// Count elements per row
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
bst_row_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
for (bst_feature_t fidx = fbeg; fidx < n_features; fidx += stride) {
if (is_valid(batch.GetElement(ridx, fidx))) {
cnt++;
}
}

atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});

dh::XGBCachingDeviceAllocator<char> alloc;

#if defined(XGBOOST_USE_HIP)
size_t row_stride =
dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
#elif defined(XGBOOST_USE_CUDA)
size_t row_stride =
#if defined(XGBOOST_USE_CUDA)
bst_row_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
#elif defined(XGBOOST_USE_HIP)
bst_row_t row_stride =
dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
#endif

return row_stride;
}
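
The rewritten kernel lets several stride lanes cooperate on one wide row: lane fbeg of row ridx visits every stride-th feature and contributes its partial count with an atomic add. A host-only model of the indexing (NaN stands in for a missing value; the data is made up):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::size_t n_samples = 2, n_features = 8, stride = 4;  // 8 features < 32 -> stride 4
  std::vector<float> data{  // NAN marks a missing value
      1, NAN, 3, NAN, 5, 6, NAN, 8,
      NAN, NAN, 1, 1, NAN, 1, 1, NAN};
  std::vector<std::size_t> counts(n_samples, 0);

  // One loop iteration per device thread: idx unravels to (row, first feature).
  for (std::size_t idx = 0; idx < n_samples * stride; ++idx) {
    std::size_t ridx = idx / stride, fbeg = idx % stride, cnt = 0;
    for (std::size_t fidx = fbeg; fidx < n_features; fidx += stride) {
      if (!std::isnan(data[ridx * n_features + fidx])) cnt++;
    }
    counts[ridx] += cnt;  // the device version uses atomicAdd here
  }
  std::cout << counts[0] << ' ' << counts[1] << '\n';  // prints: 5 4
}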

@ -243,13 +279,29 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
* \brief Check there's no inf in data.
*/
template <typename AdapterBatchT>
bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
auto counting = thrust::make_counting_iterator(0llu);
auto value_iter = dh::MakeTransformIterator<float>(
counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
auto valid =
thrust::none_of(value_iter, value_iter + batch.Size(),
[is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
auto value_iter = dh::MakeTransformIterator<bool>(counting, [=] XGBOOST_DEVICE(std::size_t idx) {
auto v = batch.GetElement(idx).value;
if (!is_valid(v)) {
// discard the invalid elements.
return true;
}
// check that there's no inf in data.
return !std::isinf(v);
});
dh::XGBCachingDeviceAllocator<char> alloc;
// The default implementation in thrust optimizes any_of/none_of/all_of by using small
// intervals to early stop. But we expect all data to be valid here, using small
// intervals only decreases performance due to excessive kernel launch and stream
// synchronization.
#if defined(XGBOOST_USE_CUDA)
auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true,
thrust::logical_and<>{});
#elif defined(XGBOOST_USE_HIP)
auto valid = dh::Reduce(thrust::hip::par(alloc), value_iter, value_iter + batch.Size(), true,
thrust::logical_and<>{});
#endif
return valid;
}
}; // namespace data
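
Replacing none_of with a logical-AND reduction trades early exit for a single kernel launch, as the comment explains. A host-only sketch of the same fold using std::transform_reduce (the sample values are made up; NaN plays the role of an element rejected by is_valid):

#include <cmath>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<float> values{1.f, NAN, 3.f};  // NAN: invalid, simply skipped
  auto ok = [](float v) {
    if (std::isnan(v)) return true;  // discard invalid elements
    return !std::isinf(v);           // valid elements must be finite
  };
  bool valid = std::transform_reduce(values.begin(), values.end(), true,
                                     std::logical_and<>{}, ok);
  std::cout << (valid ? "no inf in data" : "inf found") << '\n';
}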

@ -213,7 +213,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// correct output position
auto counting = thrust::make_counting_iterator(0llu);
data::IsValidFunctor is_valid(missing);
bool valid = data::HasInfInData(batch, is_valid);
bool valid = data::NoInfInData(batch, is_valid);
CHECK(valid) << error::InfInData();

auto key_iter = dh::MakeTransformIterator<size_t>(

@ -92,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
}
auto batch_rows = num_rows();
accumulated_rows += batch_rows;
dh::caching_device_vector<size_t> row_counts(batch_rows + 1, 0);
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
@ -163,7 +163,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
#endif

auto rows = num_rows();
dh::caching_device_vector<size_t> row_counts(rows + 1, 0);
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);

@ -92,7 +92,7 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
SparsePage* page) {
bool valid = HasInfInData(batch, IsValidFunctor{missing});
bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();

page->offset.SetDevice(device);

@ -67,7 +67,7 @@ class ColumnSplitHelper {
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
partition_builder_->AllocateForTask(task_id);
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree,
(*row_set_collection_)[nid].begin, decision_bits_,
missing_bits_);
});

@ -25,7 +25,6 @@
#include "xgboost/linalg.h" // for Constants, Vector

namespace xgboost::tree {
template <typename ExpandEntry>
class HistEvaluator {
private:
struct NodeEntry {
@ -285,10 +284,42 @@ class HistEvaluator {
return left_sum;
}

/**
* @brief Gather the expand entries from all the workers.
* @param entries Local expand entries on this worker.
* @return Global expand entries gathered from all workers.
*/
std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();

// First, gather all the primitive fields.
std::vector<CPUExpandEntry> all_entries(num_entries * world);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
for (std::size_t i = 0; i < num_entries; i++) {
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
}
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));

// Gather all the cat_bits.
auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);

common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
// Copy the cat_bits back into all expand entries.
all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
all_entries[i].split.cat_bits.begin());
});

return all_entries;
}
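
After the two gathers, each entry's variable-length payload is copied back by indexing into the flat result with the returned offsets and sizes. A single-process sketch of just that reassembly step (the gathered buffers are faked; Entry is an illustrative stand-in for CPUExpandEntry):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for CPUExpandEntry.
struct Entry {
  int nid;
  std::vector<std::uint32_t> cat_bits;
};

int main() {
  // Pretend AllgatherV already ran: flat payload for 2 entries (sizes 2 and 1).
  std::vector<std::uint32_t> result{7, 8, 9};
  std::vector<std::size_t> sizes{2, 1};
  std::vector<std::size_t> offsets{0, 2};

  std::vector<Entry> entries{{0, {}}, {1, {}}};
  for (std::size_t i = 0; i < entries.size(); ++i) {
    entries[i].cat_bits.resize(sizes[i]);
    std::copy_n(result.cbegin() + offsets[i], sizes[i], entries[i].cat_bits.begin());
  }
  std::cout << entries[0].cat_bits[1] << ' ' << entries[1].cat_bits[0] << '\n';  // 8 9
}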

public:
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<ExpandEntry> *p_entries) {
std::vector<CPUExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
auto& entries = *p_entries;
// All nodes are on the same level, so we can store the shared ptr.
@ -306,7 +337,7 @@ class HistEvaluator {
return features[nidx_in_set]->Size();
}, grain_size);

std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
for (size_t i = 0; i < entries.size(); ++i) {
for (decltype(n_threads) j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
@ -365,22 +396,18 @@ class HistEvaluator {
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
std::vector<ExpandEntry> buffer{num_entries * world};
std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
for (auto worker = 0; worker < world; ++worker) {
auto all_entries = Allgather(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
entries[nidx_in_set].split.Update(
all_entries[worker * entries.size() + nidx_in_set].split);
}
}
}
}

// Add splits to tree, handles all statistics
void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
RegTree &tree = *p_tree;

@ -465,6 +492,7 @@ class HistMultiEvaluator {
FeatureInteractionConstraintHost interaction_constraints_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
Context const *ctx_;
bool is_col_split_{false};

private:
static double MultiCalcSplitGain(TrainParam const &param,
@ -543,6 +571,57 @@ class HistMultiEvaluator {
return false;
}

/**
* @brief Gather the expand entries from all the workers.
* @param entries Local expand entries on this worker.
* @return Global expand entries gathered from all workers.
*/
std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();

// First, gather all the primitive fields.
std::vector<MultiExpandEntry> all_entries(num_entries * world);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
std::vector<GradientPairPrecise> gradients;
for (std::size_t i = 0; i < num_entries; i++) {
all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
&gradients);
}
collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));

// Gather all the cat_bits.
auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);

// Gather all the gradients.
auto const num_gradients = gradients.size();
std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));

auto const total_entries = num_entries * world;
auto const gradients_per_entry = num_gradients / num_entries;
auto const gradients_per_side = gradients_per_entry / 2;
common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
// Copy the cat_bits back into all expand entries.
all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());

// Copy the gradients back into all expand entries.
all_entries[i].split.left_sum.resize(gradients_per_side);
std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
all_entries[i].split.left_sum.begin());
all_entries[i].split.right_sum.resize(gradients_per_side);
std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
gradients_per_side, all_entries[i].split.right_sum.begin());
});

return all_entries;
}

public:
void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@ -597,6 +676,18 @@ class HistMultiEvaluator {
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}

if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto all_entries = Allgather(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(
all_entries[worker * entries.size() + nidx_in_set].split);
}
}
}
}

linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
@ -660,7 +751,10 @@ class HistMultiEvaluator {

explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
: param_{param},
column_sampler_{std::move(sampler)},
ctx_{ctx},
is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,

@ -70,6 +70,22 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
os << "split:\n" << e.split << std::endl;
return os;
}

/**
* @brief Copy primitive fields into this, and collect cat_bits into a vector.
*
* This is used for allgather.
*
* @param that The other entry to copy from
* @param collected_cat_bits The vector to collect cat_bits
* @param cat_bits_sizes The sizes of the collected cat_bits
*/
void CopyAndCollect(CPUExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
std::vector<std::size_t>* cat_bits_sizes) {
nid = that.nid;
depth = that.depth;
split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes);
}
};

struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
@ -119,6 +135,24 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
os << "]\n";
return os;
}

/**
* @brief Copy primitive fields into this, and collect cat_bits and gradients into vectors.
*
* This is used for allgather.
*
* @param that The other entry to copy from
* @param collected_cat_bits The vector to collect cat_bits
* @param cat_bits_sizes The sizes of the collected cat_bits
* @param collected_gradients The vector to collect gradients
*/
void CopyAndCollect(MultiExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
std::vector<std::size_t>* cat_bits_sizes,
std::vector<GradientPairPrecise>* collected_gradients) {
nid = that.nid;
depth = that.depth;
split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes, collected_gradients);
}
};
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_

@ -419,6 +419,60 @@ struct SplitEntryContainer {
<< "right_sum: " << s.right_sum << std::endl;
return os;
}

/**
* @brief Copy primitive fields into this, and collect cat_bits into a vector.
*
* This is used for allgather.
*
* @param that The other entry to copy from
* @param collected_cat_bits The vector to collect cat_bits
* @param cat_bits_sizes The sizes of the collected cat_bits
*/
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
std::vector<uint32_t> *collected_cat_bits,
std::vector<std::size_t> *cat_bits_sizes) {
loss_chg = that.loss_chg;
sindex = that.sindex;
split_value = that.split_value;
is_cat = that.is_cat;
static_assert(std::is_trivially_copyable_v<GradientT>);
left_sum = that.left_sum;
right_sum = that.right_sum;
collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(),
that.cat_bits.cend());
cat_bits_sizes->emplace_back(that.cat_bits.size());
}

/**
* @brief Copy primitive fields into this, and collect cat_bits and gradient sums into vectors.
*
* This is used for allgather.
*
* @param that The other entry to copy from
* @param collected_cat_bits The vector to collect cat_bits
* @param cat_bits_sizes The sizes of the collected cat_bits
* @param collected_gradients The vector to collect gradients
*/
template <typename G>
void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
std::vector<uint32_t> *collected_cat_bits,
std::vector<std::size_t> *cat_bits_sizes,
std::vector<G> *collected_gradients) {
loss_chg = that.loss_chg;
sindex = that.sindex;
split_value = that.split_value;
is_cat = that.is_cat;
collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(),
that.cat_bits.cend());
cat_bits_sizes->emplace_back(that.cat_bits.size());
static_assert(!std::is_trivially_copyable_v<GradientT>);
collected_gradients->insert(collected_gradients->end(), that.left_sum.cbegin(),
that.left_sum.cend());
collected_gradients->insert(collected_gradients->end(), that.right_sum.cbegin(),
that.right_sum.cend());
}
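
The two overloads split on exactly this property: a gradient type that is trivially copyable can ride along in the byte-wise Allgather, while vector-valued sums must be collected separately. A compile-time sketch of the distinction (FixedPart and WithPayload are illustrative stand-ins, not the real types):

#include <cstdint>
#include <type_traits>
#include <vector>

// Illustrative stand-ins; the real types are SplitEntryContainer instantiations.
struct FixedPart {
  float loss_chg;
  std::uint32_t sindex;
  float split_value;
  bool is_cat;
};
struct WithPayload {
  FixedPart fixed;
  std::vector<std::uint32_t> cat_bits;
};

static_assert(std::is_trivially_copyable_v<FixedPart>,
              "safe to ship as raw bytes through Allgather");
static_assert(!std::is_trivially_copyable_v<WithPayload>,
              "payload must be collected and gathered separately");

int main() { return 0; }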
|
||||
|
||||
/*!\return feature index to split on */
|
||||
[[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
|
||||
/*!\return whether missing value goes to left branch */
|
||||
|
||||
@ -44,7 +44,7 @@ class GloablApproxBuilder {
|
||||
protected:
|
||||
TrainParam const *param_;
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
HistEvaluator<CPUExpandEntry> evaluator_;
|
||||
HistEvaluator evaluator_;
|
||||
HistogramBuilder<CPUExpandEntry> histogram_builder_;
|
||||
Context const *ctx_;
|
||||
ObjInfo const *const task_;
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
#include <utility> // for move, swap
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../collective/aggregator.h" // for GlobalSum
|
||||
#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed
|
||||
#include "../collective/communicator.h" // for Operation
|
||||
#include "../common/hist_util.h" // for HistogramCuts, HistCollection
|
||||
@ -200,8 +201,8 @@ class MultiTargetHistBuilder {
|
||||
}
|
||||
}
|
||||
CHECK(root_sum.CContiguous());
|
||||
collective::Allreduce<collective::Operation::kSum>(
|
||||
reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
|
||||
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
|
||||
root_sum.Size() * 2);
|
||||
|
||||
std::vector<MultiExpandEntry> nodes{best};
|
||||
std::size_t i = 0;
|
||||
@ -335,7 +336,7 @@ class HistBuilder {
|
||||
common::Monitor *monitor_;
|
||||
TrainParam const *param_;
|
||||
std::shared_ptr<common::ColumnSampler> col_sampler_;
|
||||
std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
|
||||
std::unique_ptr<HistEvaluator> evaluator_;
|
||||
std::vector<CommonRowPartitioner> partitioner_;
|
||||
|
||||
// back pointers to tree and data matrix
|
||||
@ -354,7 +355,7 @@ class HistBuilder {
|
||||
: monitor_{monitor},
|
||||
param_{param},
|
||||
col_sampler_{std::move(column_sampler)},
|
||||
evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
|
||||
evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
|
||||
col_sampler_)},
|
||||
p_last_fmat_(fmat),
|
||||
histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
|
||||
@ -395,8 +396,7 @@ class HistBuilder {
|
||||
}
|
||||
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
|
||||
collective::IsDistributed(), fmat->Info().IsColumnSplit());
|
||||
evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
|
||||
col_sampler_);
|
||||
evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
|
||||
p_last_tree_ = p_tree;
|
||||
monitor_->Stop(__func__);
|
||||
}
|
||||
@ -455,8 +455,7 @@ class HistBuilder {
|
||||
for (auto const &grad : gpair_h) {
|
||||
grad_stat.Add(grad.GetGrad(), grad.GetHess());
|
||||
}
|
||||
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
|
||||
2);
|
||||
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
|
||||
}
|
||||
|
||||
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
|
||||
|
||||
@ -20,7 +20,7 @@ namespace xgboost::tree {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(updater_refresh);
|
||||
|
||||
/*! \brief pruner that prunes a tree after growing finishs */
|
||||
/*! \brief pruner that prunes a tree after growing finishes */
|
||||
class TreeRefresher : public TreeUpdater {
|
||||
public:
|
||||
explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}
|
||||
|
||||
@ -4,11 +4,18 @@ set -euo pipefail

source tests/buildkite/conftest.sh

echo "--- Build XGBoost JVM packages"
echo "--- Build XGBoost JVM packages scala 2.12"
tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
  ${SPARK_VERSION}


echo "--- Build XGBoost JVM packages scala 2.13"

tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \
  ${SPARK_VERSION} "" "" "true"

echo "--- Stash XGBoost4J JARs"
buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"

@ -25,7 +25,7 @@ set -x
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02
SPARK_VERSION=3.1.1
SPARK_VERSION=3.4.0
JDK_VERSION=8

if [[ -z ${BUILDKITE:-} ]]

@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04
ARG CUDA_VERSION_ARG

# Environment
@ -7,22 +7,21 @@ ENV DEBIAN_FRONTEND noninteractive

# Install all basic requirements
RUN \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
    apt-get install -y wget git python3 python3-pip software-properties-common \
        apt-transport-https ca-certificates gnupg-agent && \
    wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
    add-apt-repository -u 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main' && \
    apt-get update && \
    apt-get install -y llvm-15 clang-tidy-15 clang-15 libomp-15-dev && \
    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
    apt-get install -y cmake

# Set default clang-tidy version
RUN \
    update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 && \
    update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 100

RUN \
    apt-get install libgtest-dev libgmock-dev -y

# Install Python packages
RUN \
    pip3 install pyyaml

@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu18.04
FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04
ARG CUDA_VERSION_ARG
ARG RAPIDS_VERSION_ARG

@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell

# Install all basic requirements
RUN \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
    # Python
@ -25,7 +25,7 @@ RUN \
        python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
        dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
        pyspark cloudpickle cuda-python && \
        pyspark>=3.4.0 cloudpickle cuda-python && \
    mamba clean --all && \
    conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

@ -1,53 +0,0 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu16.04
ARG CUDA_VERSION_ARG
ARG JDK_VERSION=8
ARG SPARK_VERSION=3.0.0

# Environment
ENV DEBIAN_FRONTEND noninteractive

# Install all basic requirements
RUN \
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \
    apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:openjdk-r/ppa && \
    apt-get update && \
    apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \
    # Python
    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
    bash conda.sh -b -p /opt/mambaforge && \
    /opt/mambaforge/bin/pip install awscli && \
    # Maven
    wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
    tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
    ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
    # Spark
    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
    tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
    ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark

ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH

# Install Python packages
RUN \
    pip install numpy scipy pandas scikit-learn

ENV GOSU_VERSION 1.10

# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
    chmod +x /usr/local/bin/gosu && \
    gosu nobody true

# Set default JDK version
RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64

# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/

WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]
@ -20,10 +20,14 @@ RUN \
    wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
    tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
    ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
    # Spark
    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \
    tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \
    ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark
    # Spark with scala 2.12
    mkdir -p /opt/spark-scala-2.12 && \
    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \
    tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \
    # Spark with scala 2.13
    mkdir -p /opt/spark-scala-2.13 && \
    wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \
    tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13

ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH

@ -6,6 +6,7 @@ set -x
spark_version=$1
use_cuda=$2
gpu_arch=$3
use_scala213=$4

gpu_options=""
if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
@ -22,7 +23,13 @@ export RABIT_MOCK=ON
if [ "x$gpu_arch" != "x" ]; then
  export GPU_ARCH_FLAG=$gpu_arch
fi
mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options

mvn_profile_string=""
if [ "x$use_scala213" != "x" ]; then
  export mvn_profile_string="-Pdefault,scala-2.13"
fi

mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options

set +x
set +e

@ -28,7 +28,7 @@ dependencies:
- llvmlite
- cffi
- pyarrow
- pyspark
- pyspark>=3.4.0
- cloudpickle
- pip:
  - awscli

@ -38,8 +38,6 @@ dependencies:
- protobuf
- cloudpickle
- modin
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
- pyspark>=3.3.1
- pyspark>=3.4.0
- pip:
  - datatable

@ -35,7 +35,7 @@ dependencies:
- py-ubjson
- cffi
- pyarrow
- pyspark
- pyspark>=3.4.0
- cloudpickle
- pip:
  - sphinx_rtd_theme

@ -19,6 +19,4 @@ dependencies:
- pytest
- hypothesis
- hatchling
- pip:
  # TODO: Replace it with pyspark>=3.4 once 3.4 released.
  - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
- pyspark>=3.4.0

@ -6,14 +6,24 @@ set -x
# Initialize local Maven repository
./tests/ci_build/initialize_maven.sh

# Get version number of XGBoost4J and other auxiliary information
cd jvm-packages
jvm_packages_dir=`pwd`
# Get version number of XGBoost4J and other auxiliary information
xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout)
maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout)
spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout)
scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout)

if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
  cd $jvm_packages_dir/xgboost4j-tester
  python3 get_iris.py
  cd $jvm_packages_dir
fi

# including maven profiles for different scala versions: 2.12 is the default at the moment.
for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
  scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
  scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)

# Install XGBoost4J JAR into local Maven repository
mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
@ -24,19 +34,28 @@ mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/targe
cd xgboost4j-tester
# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests
python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version}
# Run unit tests with XGBoost4J
mvn --no-transfer-progress package

# Build package and unit tests with XGBoost4J
mvn --no-transfer-progress clean package
xgboost4j_tester_jar="$jvm_packages_dir/xgboost4j-tester/target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar"
# Run integration tests with XGBoost4J
java -jar ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar
java -jar $xgboost4j_tester_jar

# Run integration tests with XGBoost4J-Spark
if [ ! -z "$RUN_INTEGRATION_TEST" ]
then
  python3 get_iris.py
  spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv
  spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model
if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
  # Changing directory so that we do not mix code and resulting files
  cd target
  if [[ "$scala_binary_version" == "2.12" ]]; then
    /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv
    /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version}
  elif [[ "$scala_binary_version" == "2.13" ]]; then
    /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv
    /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version}
  else
    echo "Unexpected scala version: $scala_version ($scala_binary_version)."
  fi
fi
cd $jvm_packages_dir
done

set +x
set +e

@ -41,7 +41,7 @@ class ClangTidy(object):
    def __init__(self, args):
        self.cpp_lint = args.cpp
        self.cuda_lint = args.cuda
        self.use_dmlc_gtest = args.use_dmlc_gtest
        self.use_dmlc_gtest: bool = args.use_dmlc_gtest
        self.cuda_archs = args.cuda_archs.copy() if args.cuda_archs else []

        if args.tidy_version:
@ -202,6 +202,7 @@ class ClangTidy(object):
        cdb_file = os.path.join(self.cdb_path, 'compile_commands.json')
        with open(cdb_file, 'r') as fd:
            self.compile_commands = json.load(fd)

        tidy_file = os.path.join(self.root_path, '.clang-tidy')
        with open(tidy_file) as fd:
            self.clang_tidy = yaml.safe_load(fd)
@ -276,16 +277,24 @@ right keywords?
    print('clang-tidy is working.')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run clang-tidy.')
    parser.add_argument('--cpp', type=int, default=1)
    parser.add_argument('--tidy-version', type=int, default=None,
                        help='Specify the version of preferred clang-tidy.')
    parser.add_argument('--cuda', type=int, default=1)
    parser.add_argument('--use-dmlc-gtest', type=int, default=1,
                        help='Whether to use gtest bundled in dmlc-core.')
    parser.add_argument('--cuda-archs', action='append',
                        help='List of CUDA archs to build')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run clang-tidy.")
    parser.add_argument("--cpp", type=int, default=1)
    parser.add_argument(
        "--tidy-version",
        type=int,
        default=None,
        help="Specify the version of preferred clang-tidy.",
    )
    parser.add_argument("--cuda", type=int, default=1)
    parser.add_argument(
        "--use-dmlc-gtest",
        action="store_true",
        help="Whether to use gtest bundled in dmlc-core.",
    )
    parser.add_argument(
        "--cuda-archs", action="append", help="List of CUDA archs to build"
    )
    args = parser.parse_args()

    test_tidy(args)

@ -497,6 +497,77 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
  }
}

namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#endif
  auto n = n_samples * n_features;
  std::vector<float> x;
  x.resize(n);

  std::iota(x.begin(), x.end(), 0);
  std::int32_t c{0};
  float missing = n_samples * n_features;
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (i % 5 == 0) {
      x[i] = missing;
      c++;
    }
  }
  thrust::device_vector<float> d_x;
  d_x = x;

  auto n_invalids = n / 10 * 2 + 1;
  auto is_valid = data::IsValidFunctor{missing};
  return std::tuple{x, d_x, n_invalids, is_valid};
}

void TestGetColumnSize(std::size_t n_samples) {
  auto ctx = MakeCUDACtx(0);
  bst_feature_t n_features = 12;
  [[maybe_unused]] auto [x, d_x, n_invalids, is_valid] = MakeData(&ctx, n_samples, n_features);

  auto adapter = AdapterFromData(d_x, n_samples, n_features);
  auto batch = adapter.Value();

  auto batch_iter = dh::MakeTransformIterator<data::COOTuple>(
      thrust::make_counting_iterator(0llu),
      [=] __device__(std::size_t idx) { return batch.GetElement(idx); });

  dh::caching_device_vector<std::size_t> column_sizes_scan;
  column_sizes_scan.resize(n_features + 1);
  std::vector<std::size_t> h_column_size(column_sizes_scan.size());
  std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
  ASSERT_EQ(h_column_size, h_column_size_1);

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
  ASSERT_EQ(h_column_size, h_column_size_1);

  detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
      ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
  thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
  ASSERT_EQ(h_column_size, h_column_size_1);
}
}  // namespace

TEST(HistUtil, GetColumnSize) {
  bst_row_t n_samples = 4096;
  TestGetColumnSize(n_samples);
}

// Check sketching from adapter or DMatrix results in the same answer
// Consistency here is useful for testing and user experience
TEST(HistUtil, SketchingEquivalent) {

@ -56,7 +56,7 @@ void TestSketchUnique(float sparsity) {
      thrust::make_counting_iterator(0llu),
      [=] __device__(size_t idx) { return batch.GetElement(idx); });
  auto end = kCols * kRows;
  detail::GetColumnSizesScan(0, kCols, n_cuts, batch_iter, is_valid, 0, end,
  detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
                             &cut_sizes_scan, &column_sizes_scan);
  auto const& cut_sizes = cut_sizes_scan.HostVector();
  ASSERT_LE(sketch.Data().size(), cut_sizes.back());

@ -1,15 +1,16 @@
/*!
 * Copyright 2018 XGBoost contributors
/**
 * Copyright 2018-2023, XGBoost contributors
 */
#include <gtest/gtest.h>
#include <vector>

#include <xgboost/span.h>
#include "test_span.h"

namespace xgboost {
namespace common {
#include <gtest/gtest.h>
#include <xgboost/span.h>

#include <vector>

#include "../../../src/common/transform_iterator.h"  // for MakeIndexTransformIter

namespace xgboost::common {
TEST(Span, TestStatus) {
  int status = 1;
  TestTestStatus {&status}();
@ -526,5 +527,17 @@ TEST(SpanDeathTest, Empty) {
  Span<float> s{data.data(), static_cast<Span<float>::index_type>(0)};
  EXPECT_DEATH(s[0], "");  // not ok to use it.
}
}  // namespace common
}  // namespace xgboost

TEST(IterSpan, Basic) {
  auto iter = common::MakeIndexTransformIter([](std::size_t i) { return i; });
  std::size_t n = 13;
  auto span = IterSpan{iter, n};
  ASSERT_EQ(span.size(), n);
  for (std::size_t i = 0; i < n; ++i) {
    ASSERT_EQ(span[i], i);
  }
  ASSERT_EQ(span.subspan(1).size(), n - 1);
  ASSERT_EQ(span.subspan(1)[0], 1);
  ASSERT_EQ(span.subspan(1, 2)[1], 2);
}
}  // namespace xgboost::common

@ -62,3 +62,22 @@ void TestCudfAdapter()
TEST(DeviceAdapter, CudfAdapter) {
  TestCudfAdapter();
}

namespace xgboost::data {
TEST(DeviceAdapter, GetRowCounts) {
  auto ctx = MakeCUDACtx(0);

  for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
    HostDeviceVector<float> storage;
    auto str_arr = RandomDataGenerator{8192, n_features, 0.0}
                       .Device(ctx.gpu_id)
                       .GenerateArrayInterface(&storage);
    auto adapter = CupyAdapter{str_arr};
    HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0);
    offset.SetDevice(ctx.gpu_id);
    auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id,
                                std::numeric_limits<float>::quiet_NaN());
    ASSERT_EQ(rstride, n_features);
  }
}
}  // namespace xgboost::data

@ -23,6 +23,7 @@

#include "../../src/collective/communicator-inl.h"
#include "../../src/common/common.h"
#include "../../src/common/threading_utils.h"
#include "../../src/data/array_interface.h"
#include "filesystem.h"  // dmlc::TemporaryDirectory
#include "xgboost/linalg.h"
@ -388,6 +389,23 @@ inline Context CreateEmptyGenericParam(int gpu_id) {
  return tparam;
}

inline std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(
    std::size_t rows, bst_target_t n_targets = 1) {
  auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows * n_targets);
  auto& h_gradients = p_gradients->HostVector();

  xgboost::SimpleLCG gen;
  xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);

  for (std::size_t i = 0; i < rows * n_targets; ++i) {
    auto grad = dist(&gen);
    auto hess = dist(&gen);
    h_gradients[i] = GradientPair{grad, hess};
  }

  return p_gradients;
}

/**
 * \brief Make a context that uses CUDA.
 */
@ -509,11 +527,7 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
    xgboost::collective::Finalize();
  };
#if defined(_OPENMP)
#pragma omp parallel num_threads(world_size)
  {
    auto rank = omp_get_thread_num();
    run(rank);
  }
  common::ParallelFor(world_size, world_size, run);
#else
  std::vector<std::thread> threads;
  for (auto rank = 0; rank < world_size; rank++) {

@ -13,6 +13,7 @@

#include "../../../plugin/federated/federated_server.h"
#include "../../../src/collective/communicator-inl.h"
#include "../../../src/common/threading_utils.h"

namespace xgboost {

@ -75,11 +76,7 @@ void RunWithFederatedCommunicator(int32_t world_size, std::string const& server_
    xgboost::collective::Finalize();
  };
#if defined(_OPENMP)
#pragma omp parallel num_threads(world_size)
  {
    auto rank = omp_get_thread_num();
    run(rank);
  }
  common::ParallelFor(world_size, world_size, run);
#else
  std::vector<std::thread> threads;
  for (auto rank = 0; rank < world_size; rank++) {

@ -15,9 +15,9 @@

namespace xgboost {
namespace {
auto MakeModel(std::string objective, std::shared_ptr<DMatrix> dmat) {
auto MakeModel(std::string tree_method, std::string objective, std::shared_ptr<DMatrix> dmat) {
  std::unique_ptr<Learner> learner{Learner::Create({dmat})};
  learner->SetParam("tree_method", "approx");
  learner->SetParam("tree_method", tree_method);
  learner->SetParam("objective", objective);
  if (objective.find("quantile") != std::string::npos) {
    learner->SetParam("quantile_alpha", "0.5");
@ -35,7 +35,7 @@ auto MakeModel(std::string objective, std::shared_ptr<DMatrix> dmat) {
}

void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json expected_model,
                     std::string objective) {
                     std::string tree_method, std::string objective) {
  auto const world_size = collective::GetWorldSize();
  auto const rank = collective::GetRank();
  std::shared_ptr<DMatrix> dmat{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)};
@ -61,7 +61,7 @@ void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json e
  }
  std::shared_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};

  auto model = MakeModel(objective, sliced);
  auto model = MakeModel(tree_method, objective, sliced);
  auto base_score = GetBaseScore(model);
  ASSERT_EQ(base_score, expected_base_score);
  ASSERT_EQ(model, expected_model);
@ -76,7 +76,7 @@ class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
  void SetUp() override { server_ = std::make_unique<ServerForTest>(kWorldSize); }
  void TearDown() override { server_.reset(nullptr); }

  void Run(std::string objective) {
  void Run(std::string tree_method, std::string objective) {
    static auto constexpr kRows{16};
    static auto constexpr kCols{16};

@ -99,17 +99,22 @@ class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
      }
    }

    auto model = MakeModel(objective, dmat);
    auto model = MakeModel(tree_method, objective, dmat);
    auto score = GetBaseScore(model);

    RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyObjective, kRows, kCols,
                                 score, model, objective);
                                 score, model, tree_method, objective);
  }
};

TEST_P(FederatedLearnerTest, Objective) {
TEST_P(FederatedLearnerTest, Approx) {
  std::string objective = GetParam();
  this->Run(objective);
  this->Run("approx", objective);
}

TEST_P(FederatedLearnerTest, Hist) {
  std::string objective = GetParam();
  this->Run("hist", objective);
}

INSTANTIATE_TEST_SUITE_P(FederatedLearnerObjective, FederatedLearnerTest,

@ -33,7 +33,7 @@ void TestEvaluateSplits(bool force_read_by_column) {

  auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();

  auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
  common::HistCollection hist;
  std::vector<GradientPair> row_gpairs = {
      {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
@ -167,7 +167,7 @@ TEST(HistEvaluator, Apply) {
  param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
  auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
  auto sampler = std::make_shared<common::ColumnSampler>();
  auto evaluator_ = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
  auto evaluator_ = HistEvaluator{&ctx, &param, dmat->Info(), sampler};

  CPUExpandEntry entry{0, 0};
  entry.split.loss_chg = 10.0f;
@ -195,7 +195,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
  // check the evaluator is returning the optimal split
  std::vector<FeatureType> ft{FeatureType::kCategorical};
  auto sampler = std::make_shared<common::ColumnSampler>();
  HistEvaluator<CPUExpandEntry> evaluator{&ctx, &param_, info_, sampler};
  HistEvaluator evaluator{&ctx, &param_, info_, sampler};
  evaluator.InitRoot(GradStats{total_gpair_});
  RegTree tree;
  std::vector<CPUExpandEntry> entries(1);
@ -225,7 +225,7 @@ auto CompareOneHotAndPartition(bool onehot) {
      RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();

  auto sampler = std::make_shared<common::ColumnSampler>();
  auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
  std::vector<CPUExpandEntry> entries(1);

  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>(&ctx, {32, param.sparse_threshold})) {
@ -276,7 +276,7 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
  info.num_col_ = 1;
  info.feature_types = {FeatureType::kCategorical};
  Context ctx;
  auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param_, info, sampler};
  auto evaluator = HistEvaluator{&ctx, &param_, info, sampler};
  evaluator.InitRoot(GradStats{parent_sum_});

  std::vector<CPUExpandEntry> entries(1);

@ -79,7 +79,7 @@ TEST(CPUMonoConstraint, Basic) {
  auto Xy = RandomDataGenerator{kRows, kCols, 0.0}.GenerateDMatrix(true);
  auto sampler = std::make_shared<common::ColumnSampler>();

  HistEvaluator<CPUExpandEntry> evalutor{&ctx, &param, Xy->Info(), sampler};
  HistEvaluator evalutor{&ctx, &param, Xy->Info(), sampler};
  evalutor.InitRoot(GradStats{2.0, 2.0});

  SplitEntry split;

@ -9,28 +9,20 @@
#include "../helpers.h"

namespace xgboost::tree {
std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols){
std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols,
                                         bool categorical = false) {
  if (categorical) {
    std::vector<FeatureType> ft(cols);
    for (size_t i = 0; i < ft.size(); ++i) {
      ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
    }
    return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix();
  } else {
    return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
  }

std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(std::size_t rows) {
  auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows);
  auto& h_gradients = p_gradients->HostVector();

  xgboost::SimpleLCG gen;
  xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);

  for (std::size_t i = 0; i < rows; ++i) {
    auto grad = dist(&gen);
    auto hess = dist(&gen);
    h_gradients[i] = GradientPair{grad, hess};
  }

  return p_gradients;
}

TEST(GrowHistMaker, InteractionConstraint)
{
TEST(GrowHistMaker, InteractionConstraint) {
  auto constexpr kRows = 32;
  auto constexpr kCols = 16;
  auto p_dmat = GenerateDMatrix(kRows, kCols);
@ -74,8 +66,9 @@ TEST(GrowHistMaker, InteractionConstraint)
}

namespace {
void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_tree) {
  auto p_dmat = GenerateDMatrix(rows, cols);
void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
                       RegTree const& expected_tree) {
  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
  auto p_gradients = GenerateGradients(rows);
  Context ctx;
  ObjInfo task{ObjInfo::kRegression};
@ -90,27 +83,21 @@ void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_t
  param.Init(Args{});
  updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});

  ASSERT_EQ(tree.NumExtraNodes(), 10);
  ASSERT_EQ(tree[0].SplitIndex(), 1);

  ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
  ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);

  FeatureMap fmap;
  auto json = tree.DumpModel(fmap, false, "json");
  auto expected_json = expected_tree.DumpModel(fmap, false, "json");
  Json json{Object{}};
  tree.SaveModel(&json);
  Json expected_json{Object{}};
  expected_tree.SaveModel(&expected_json);
  ASSERT_EQ(json, expected_json);
}
}  // anonymous namespace

TEST(GrowHistMaker, ColumnSplit) {
void TestColumnSplit(bool categorical) {
  auto constexpr kRows = 32;
  auto constexpr kCols = 16;

  RegTree expected_tree{1u, kCols};
  ObjInfo task{ObjInfo::kRegression};
  {
    auto p_dmat = GenerateDMatrix(kRows, kCols);
    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
    auto p_gradients = GenerateGradients(kRows);
    Context ctx;
    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
@ -121,6 +108,12 @@ TEST(GrowHistMaker, ColumnSplit) {
  }

  auto constexpr kWorldSize = 2;
  RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit, kRows, kCols, std::cref(expected_tree));
  RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, categorical,
                              std::cref(expected_tree));
}
}  // anonymous namespace

TEST(GrowHistMaker, ColumnSplitNumerical) { TestColumnSplit(false); }

TEST(GrowHistMaker, ColumnSplitCategorical) { TestColumnSplit(true); }
}  // namespace xgboost::tree

@ -113,7 +113,6 @@ void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples,

  for (auto const& page : Xy->GetBatches<SparsePage>()) {
    GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
    bst_feature_t const split_ind = 0;
    common::ColumnMatrix column_indices;
    column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
    {
@ -194,11 +193,65 @@ void TestColumnSplitPartitioner(bst_target_t n_targets) {

  auto constexpr kWorkers = 4;
  RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
      n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner);
                              n_samples, n_features, base_rowid, Xy, min_value, mid_value,
                              mid_partitioner);
}
}  // anonymous namespace

TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner<CPUExpandEntry>(1); }

TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner<MultiExpandEntry>(3); }

namespace {
void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_targets,
                       RegTree const& expected_tree) {
  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
  auto p_gradients = GenerateGradients(rows, n_targets);
  Context ctx;
  ObjInfo task{ObjInfo::kRegression};
  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)};
  std::vector<HostDeviceVector<bst_node_t>> position(1);

  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};

  RegTree tree{n_targets, cols};
  TrainParam param;
  param.Init(Args{});
  updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});

  Json json{Object{}};
  tree.SaveModel(&json);
  Json expected_json{Object{}};
  expected_tree.SaveModel(&expected_json);
  ASSERT_EQ(json, expected_json);
}

void TestColumnSplit(bst_target_t n_targets) {
  auto constexpr kRows = 32;
  auto constexpr kCols = 16;

  RegTree expected_tree{n_targets, kCols};
  ObjInfo task{ObjInfo::kRegression};
  {
    auto Xy = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
    auto p_gradients = GenerateGradients(kRows, n_targets);
    Context ctx;
    std::unique_ptr<TreeUpdater> updater{
        TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)};
    std::vector<HostDeviceVector<bst_node_t>> position(1);
    TrainParam param;
    param.Init(Args{});
    updater->Update(&param, p_gradients.get(), Xy.get(), position, {&expected_tree});
  }

  auto constexpr kWorldSize = 2;
  RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, n_targets,
                              std::cref(expected_tree));
}
}  // anonymous namespace

TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }

TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }

}  // namespace xgboost::tree