From acd363033e1127898a1e1fec7db0efc37fc2a98e Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 25 May 2023 14:26:38 -0700 Subject: [PATCH 01/19] Fix running MGPU gtests (#9200) --- src/collective/communicator.cu | 11 +++++++---- src/collective/communicator.h | 1 - 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/collective/communicator.cu b/src/collective/communicator.cu index 0880741f9..8bd10382d 100644 --- a/src/collective/communicator.cu +++ b/src/collective/communicator.cu @@ -12,19 +12,22 @@ namespace xgboost { namespace collective { -thread_local int Communicator::device_ordinal_{-1}; thread_local std::unique_ptr Communicator::device_communicator_{}; void Communicator::Finalize() { communicator_->Shutdown(); communicator_.reset(new NoOpCommunicator()); - device_ordinal_ = -1; device_communicator_.reset(nullptr); } DeviceCommunicator* Communicator::GetDevice(int device_ordinal) { - if (!device_communicator_ || device_ordinal_ != device_ordinal) { - device_ordinal_ = device_ordinal; + thread_local auto old_device_ordinal = -1; + // If the number of GPUs changes, we need to re-initialize NCCL. + thread_local auto old_world_size = -1; + if (!device_communicator_ || device_ordinal != old_device_ordinal || + communicator_->GetWorldSize() != old_world_size) { + old_device_ordinal = device_ordinal; + old_world_size = communicator_->GetWorldSize(); #ifdef XGBOOST_USE_NCCL if (type_ != CommunicatorType::kFederated) { device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get())); diff --git a/src/collective/communicator.h b/src/collective/communicator.h index 885a8d438..6cda5e47c 100644 --- a/src/collective/communicator.h +++ b/src/collective/communicator.h @@ -229,7 +229,6 @@ class Communicator { static thread_local std::unique_ptr communicator_; static thread_local CommunicatorType type_; #if defined(XGBOOST_USE_CUDA) - static thread_local int device_ordinal_; static thread_local std::unique_ptr device_communicator_; #endif From 5b69534b43043d14a5757f40b7ba82cb268af6ac Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Fri, 26 May 2023 01:56:05 -0700 Subject: [PATCH 02/19] Support column split in multi-target `hist` (#9171) --- src/collective/communicator-inl.h | 42 +++++++ src/common/partition_builder.h | 4 +- src/common/quantile.cc | 22 ++-- src/common/quantile.h | 3 +- src/data/data.cc | 3 + src/tree/hist/evaluate_splits.h | 120 +++++++++++++++++--- src/tree/hist/expand_entry.h | 34 ++++++ src/tree/param.h | 54 +++++++++ src/tree/updater_approx.cc | 2 +- src/tree/updater_quantile_hist.cc | 15 ++- tests/cpp/helpers.h | 24 +++- tests/cpp/plugin/helpers.h | 7 +- tests/cpp/plugin/test_federated_learner.cc | 23 ++-- tests/cpp/tree/hist/test_evaluate_splits.cc | 10 +- tests/cpp/tree/test_constraints.cc | 2 +- tests/cpp/tree/test_histmaker.cc | 61 +++++----- tests/cpp/tree/test_quantile_hist.cc | 56 ++++++++- 17 files changed, 386 insertions(+), 96 deletions(-) diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h index 702bda256..8ae96dd8a 100644 --- a/src/collective/communicator-inl.h +++ b/src/collective/communicator-inl.h @@ -3,6 +3,7 @@ */ #pragma once #include +#include #include "communicator.h" @@ -224,5 +225,46 @@ inline void Allreduce(double *send_receive_buffer, size_t count) { Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op); } +template +struct AllgatherVResult { + std::vector offsets; + std::vector sizes; + std::vector result; +}; + +/** + * @brief Gathers variable-length data from all processes 
and distributes it to all processes. + * + * We assume each worker has the same number of inputs, but each input may be of a different size. + * + * @param inputs All the inputs from the local worker. + * @param sizes Sizes of each input. + */ +template +inline AllgatherVResult AllgatherV(std::vector const &inputs, + std::vector const &sizes) { + auto num_inputs = sizes.size(); + + // Gather the sizes across all workers. + std::vector all_sizes(num_inputs * GetWorldSize()); + std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank()); + collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t)); + + // Calculate input offsets (std::exclusive_scan). + std::vector offsets(all_sizes.size()); + for (auto i = 1; i < offsets.size(); i++) { + offsets[i] = offsets[i - 1] + all_sizes[i - 1]; + } + + // Gather all the inputs. + auto total_input_size = offsets.back() + all_sizes.back(); + std::vector all_inputs(total_input_size); + std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]); + // We cannot use allgather here, since each worker might have a different size. + Allreduce(all_inputs.data(), all_inputs.size()); + + return {offsets, all_sizes, all_inputs}; +} + } // namespace collective } // namespace xgboost diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index cdc8aa193..cfa2f6a58 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -209,7 +209,7 @@ class PartitionBuilder { BitVector* decision_bits, BitVector* missing_bits) { common::Span rid_span(rid + range.begin(), rid + range.end()); std::size_t nid = nodes[node_in_set].nid; - bst_feature_t fid = tree[nid].SplitIndex(); + bst_feature_t fid = tree.SplitIndex(nid); bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; auto node_cats = tree.NodeCats(nid); auto const& cut_values = gmat.cut.Values(); @@ -270,7 +270,7 @@ class PartitionBuilder { common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); std::size_t nid = nodes[node_in_set].nid; - bool default_left = tree[nid].DefaultLeft(); + bool default_left = tree.DefaultLeft(nid); auto pred = [&](auto ridx) { bool go_left = default_left; diff --git a/src/common/quantile.cc b/src/common/quantile.cc index a93184b95..390ce34d2 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -7,7 +7,6 @@ #include #include "../collective/aggregator.h" -#include "../collective/communicator-inl.h" #include "../data/adapter.h" #include "categorical.h" #include "hist_util.h" @@ -143,6 +142,7 @@ struct QuantileAllreduce { template void SketchContainerImpl::GatherSketchInfo( + MetaInfo const& info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches) { @@ -168,7 +168,7 @@ void SketchContainerImpl::GatherSketchInfo( std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1); // Gather all column pointers - collective::Allreduce(sketches_scan.data(), sketches_scan.size()); + collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size()); for (int32_t i = 0; i < world; ++i) { size_t back = (i + 1) * (n_columns + 1) - 1; auto n_entries = sketches_scan.at(back); @@ -196,7 +196,8 @@ void SketchContainerImpl::GatherSketchInfo( static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float), "Unexpected size of sketch entry."); - 
collective::Allreduce( + collective::GlobalSum( + info, reinterpret_cast(global_sketches.data()), global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float)); } @@ -222,8 +223,7 @@ void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { std::vector global_feat_ptrs(feature_ptr.size() * world_size, 0); size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin); - collective::Allreduce(global_feat_ptrs.data(), - global_feat_ptrs.size()); + collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size()); // move all categories into a flatten vector to prepare for allreduce size_t total = feature_ptr.back(); @@ -236,8 +236,7 @@ void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { // indptr for indexing workers std::vector global_worker_ptr(world_size + 1, 0); global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr - collective::Allreduce(global_worker_ptr.data(), - global_worker_ptr.size()); + collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size()); std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin()); // total number of categories in all workers with all features auto gtotal = global_worker_ptr.back(); @@ -249,8 +248,7 @@ void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { CHECK_EQ(rank_size, total); std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin); // gather values from all workers. - collective::Allreduce(global_categories.data(), - global_categories.size()); + collective::GlobalSum(info, global_categories.data(), global_categories.size()); QuantileAllreduce allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs, categories_.size()}; ParallelFor(categories_.size(), n_threads_, [&](auto fidx) { @@ -323,7 +321,7 @@ void SketchContainerImpl::AllReduce( std::vector sketches_scan((n_columns + 1) * world, 0); std::vector global_sketches; - this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, &global_sketches); + this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches); std::vector final_sketches(n_columns); @@ -371,7 +369,9 @@ auto AddCategories(std::set const &categories, HistogramCuts *cuts) { InvalidCategory(); } auto &cut_values = cuts->cut_values_.HostVector(); - auto max_cat = *std::max_element(categories.cbegin(), categories.cend()); + // With column-wise data split, the categories may be empty. + auto max_cat = + categories.empty() ? 0.0f : *std::max_element(categories.cbegin(), categories.cend()); CheckMaxCat(max_cat, categories.size()); for (bst_cat_t i = 0; i <= AsCat(max_cat); ++i) { cut_values.push_back(i); diff --git a/src/common/quantile.h b/src/common/quantile.h index 0a82f7c90..dc0a4872a 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -822,7 +822,8 @@ class SketchContainerImpl { return group_ind; } // Gather sketches from all workers. 
- void GatherSketchInfo(std::vector const &reduced, + void GatherSketchInfo(MetaInfo const& info, + std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches); diff --git a/src/data/data.cc b/src/data/data.cc index 1aedd6d92..f9886b2f0 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -698,6 +698,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col this->feature_type_names = that.feature_type_names; auto &h_feature_types = feature_types.HostVector(); LoadFeatureType(this->feature_type_names, &h_feature_types); + } else if (!that.feature_types.Empty()) { + this->feature_types.Resize(that.feature_types.Size()); + this->feature_types.Copy(that.feature_types); } if (!that.feature_weights.Empty()) { this->feature_weights.Resize(that.feature_weights.Size()); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index ec1ce769f..c8321b48a 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -25,7 +25,6 @@ #include "xgboost/linalg.h" // for Constants, Vector namespace xgboost::tree { -template class HistEvaluator { private: struct NodeEntry { @@ -285,10 +284,42 @@ class HistEvaluator { return left_sum; } + /** + * @brief Gather the expand entries from all the workers. + * @param entries Local expand entries on this worker. + * @return Global expand entries gathered from all workers. + */ + std::vector Allgather(std::vector const &entries) { + auto const world = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + auto const num_entries = entries.size(); + + // First, gather all the primitive fields. + std::vector all_entries(num_entries * world); + std::vector cat_bits; + std::vector cat_bits_sizes; + for (auto i = 0; i < num_entries; i++) { + all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes); + } + collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry)); + + // Gather all the cat_bits. + auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes); + + common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) { + // Copy the cat_bits back into all expand entries. + all_entries[i].split.cat_bits.resize(gathered.sizes[i]); + std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i], + all_entries[i].split.cat_bits.begin()); + }); + + return all_entries; + } + public: void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut, common::Span feature_types, const RegTree &tree, - std::vector *p_entries) { + std::vector *p_entries) { auto n_threads = ctx_->Threads(); auto& entries = *p_entries; // All nodes are on the same level, so we can store the shared ptr. @@ -306,7 +337,7 @@ class HistEvaluator { return features[nidx_in_set]->Size(); }, grain_size); - std::vector tloc_candidates(n_threads * entries.size()); + std::vector tloc_candidates(n_threads * entries.size()); for (size_t i = 0; i < entries.size(); ++i) { for (decltype(n_threads) j = 0; j < n_threads; ++j) { tloc_candidates[i * n_threads + j] = entries[i]; @@ -365,22 +396,18 @@ class HistEvaluator { if (is_col_split_) { // With column-wise data split, we gather the best splits from all the workers and update the // expand entries accordingly. 
- auto const world = collective::GetWorldSize(); - auto const rank = collective::GetRank(); - auto const num_entries = entries.size(); - std::vector buffer{num_entries * world}; - std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank); - collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry)); - for (auto worker = 0; worker < world; ++worker) { + auto all_entries = Allgather(entries); + for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { - entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split); + entries[nidx_in_set].split.Update( + all_entries[worker * entries.size() + nidx_in_set].split); } } } } // Add splits to tree, handles all statistic - void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) { + void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) { auto evaluator = tree_evaluator_.GetEvaluator(); RegTree &tree = *p_tree; @@ -465,6 +492,7 @@ class HistMultiEvaluator { FeatureInteractionConstraintHost interaction_constraints_; std::shared_ptr column_sampler_; Context const *ctx_; + bool is_col_split_{false}; private: static double MultiCalcSplitGain(TrainParam const ¶m, @@ -543,6 +571,57 @@ class HistMultiEvaluator { return false; } + /** + * @brief Gather the expand entries from all the workers. + * @param entries Local expand entries on this worker. + * @return Global expand entries gathered from all workers. + */ + std::vector Allgather(std::vector const &entries) { + auto const world = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + auto const num_entries = entries.size(); + + // First, gather all the primitive fields. + std::vector all_entries(num_entries * world); + std::vector cat_bits; + std::vector cat_bits_sizes; + std::vector gradients; + for (auto i = 0; i < num_entries; i++) { + all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, + &gradients); + } + collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry)); + + // Gather all the cat_bits. + auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes); + + // Gather all the gradients. + auto const num_gradients = gradients.size(); + std::vector all_gradients(num_gradients * world); + std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank); + collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise)); + + auto const total_entries = num_entries * world; + auto const gradients_per_entry = num_gradients / num_entries; + auto const gradients_per_side = gradients_per_entry / 2; + common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) { + // Copy the cat_bits back into all expand entries. + all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]); + std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i], + gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin()); + + // Copy the gradients back into all expand entries. 
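+      // Entry i occupies gradients_per_entry consecutive values in all_gradients: the first
+      // gradients_per_side values are its left sum, the remaining half its right sum.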
+ all_entries[i].split.left_sum.resize(gradients_per_side); + std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side, + all_entries[i].split.left_sum.begin()); + all_entries[i].split.right_sum.resize(gradients_per_side); + std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side, + gradients_per_side, all_entries[i].split.right_sum.begin()); + }); + + return all_entries; + } + public: void EvaluateSplits(RegTree const &tree, common::Span hist, common::HistogramCuts const &cut, std::vector *p_entries) { @@ -597,6 +676,18 @@ class HistMultiEvaluator { entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split); } } + + if (is_col_split_) { + // With column-wise data split, we gather the best splits from all the workers and update the + // expand entries accordingly. + auto all_entries = Allgather(entries); + for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { + for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { + entries[nidx_in_set].split.Update( + all_entries[worker * entries.size() + nidx_in_set].split); + } + } + } } linalg::Vector InitRoot(linalg::VectorView root_sum) { @@ -660,7 +751,10 @@ class HistMultiEvaluator { explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param, std::shared_ptr sampler) - : param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} { + : param_{param}, + column_sampler_{std::move(sampler)}, + ctx_{ctx}, + is_col_split_{info.IsColumnSplit()} { interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), param_->colsample_bynode, param_->colsample_bylevel, diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h index acd6edf2b..a481cc432 100644 --- a/src/tree/hist/expand_entry.h +++ b/src/tree/hist/expand_entry.h @@ -70,6 +70,22 @@ struct CPUExpandEntry : public ExpandEntryImpl { os << "split:\n" << e.split << std::endl; return os; } + + /** + * @brief Copy primitive fields into this, and collect cat_bits into a vector. + * + * This is used for allgather. + * + * @param that The other entry to copy from + * @param collected_cat_bits The vector to collect cat_bits + * @param cat_bits_sizes The sizes of the collected cat_bits + */ + void CopyAndCollect(CPUExpandEntry const& that, std::vector* collected_cat_bits, + std::vector* cat_bits_sizes) { + nid = that.nid; + depth = that.depth; + split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes); + } }; struct MultiExpandEntry : public ExpandEntryImpl { @@ -119,6 +135,24 @@ struct MultiExpandEntry : public ExpandEntryImpl { os << "]\n"; return os; } + + /** + * @brief Copy primitive fields into this, and collect cat_bits and gradients into vectors. + * + * This is used for allgather. 
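+   * Fixed-size members are copied directly, while the variable-length cat_bits and gradient
+   * sums are appended to the supplied vectors so they can be gathered separately.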
+ * + * @param that The other entry to copy from + * @param collected_cat_bits The vector to collect cat_bits + * @param cat_bits_sizes The sizes of the collected cat_bits + * @param collected_gradients The vector to collect gradients + */ + void CopyAndCollect(MultiExpandEntry const& that, std::vector* collected_cat_bits, + std::vector* cat_bits_sizes, + std::vector* collected_gradients) { + nid = that.nid; + depth = that.depth; + split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes, collected_gradients); + } }; } // namespace xgboost::tree #endif // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_ diff --git a/src/tree/param.h b/src/tree/param.h index 0d59a5c35..e182fe539 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -419,6 +419,60 @@ struct SplitEntryContainer { << "right_sum: " << s.right_sum << std::endl; return os; } + + /** + * @brief Copy primitive fields into this, and collect cat_bits into a vector. + * + * This is used for allgather. + * + * @param that The other entry to copy from + * @param collected_cat_bits The vector to collect cat_bits + * @param cat_bits_sizes The sizes of the collected cat_bits + */ + void CopyAndCollect(SplitEntryContainer const &that, + std::vector *collected_cat_bits, + std::vector *cat_bits_sizes) { + loss_chg = that.loss_chg; + sindex = that.sindex; + split_value = that.split_value; + is_cat = that.is_cat; + static_assert(std::is_trivially_copyable_v); + left_sum = that.left_sum; + right_sum = that.right_sum; + collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(), + that.cat_bits.cend()); + cat_bits_sizes->emplace_back(that.cat_bits.size()); + } + + /** + * @brief Copy primitive fields into this, and collect cat_bits and gradient sums into vectors. + * + * This is used for allgather. 
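+   * Unlike the overload above, the gradient sums here are not trivially copyable, so their
+   * elements are appended to collected_gradients instead of being copied member-wise.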
+ * + * @param that The other entry to copy from + * @param collected_cat_bits The vector to collect cat_bits + * @param cat_bits_sizes The sizes of the collected cat_bits + * @param collected_gradients The vector to collect gradients + */ + template + void CopyAndCollect(SplitEntryContainer const &that, + std::vector *collected_cat_bits, + std::vector *cat_bits_sizes, + std::vector *collected_gradients) { + loss_chg = that.loss_chg; + sindex = that.sindex; + split_value = that.split_value; + is_cat = that.is_cat; + collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(), + that.cat_bits.cend()); + cat_bits_sizes->emplace_back(that.cat_bits.size()); + static_assert(!std::is_trivially_copyable_v); + collected_gradients->insert(collected_gradients->end(), that.left_sum.cbegin(), + that.left_sum.cend()); + collected_gradients->insert(collected_gradients->end(), that.right_sum.cbegin(), + that.right_sum.cend()); + } + /*!\return feature index to split on */ [[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); } /*!\return whether missing value goes to left branch */ diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index f637427ad..78506305f 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -44,7 +44,7 @@ class GloablApproxBuilder { protected: TrainParam const *param_; std::shared_ptr col_sampler_; - HistEvaluator evaluator_; + HistEvaluator evaluator_; HistogramBuilder histogram_builder_; Context const *ctx_; ObjInfo const *const task_; diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index f0dd3dd12..68d74bea3 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -13,6 +13,7 @@ #include // for move, swap #include // for vector +#include "../collective/aggregator.h" // for GlobalSum #include "../collective/communicator-inl.h" // for Allreduce, IsDistributed #include "../collective/communicator.h" // for Operation #include "../common/hist_util.h" // for HistogramCuts, HistCollection @@ -200,8 +201,8 @@ class MultiTargetHistBuilder { } } CHECK(root_sum.CContiguous()); - collective::Allreduce( - reinterpret_cast(root_sum.Values().data()), root_sum.Size() * 2); + collective::GlobalSum(p_fmat->Info(), reinterpret_cast(root_sum.Values().data()), + root_sum.Size() * 2); std::vector nodes{best}; std::size_t i = 0; @@ -335,7 +336,7 @@ class HistBuilder { common::Monitor *monitor_; TrainParam const *param_; std::shared_ptr col_sampler_; - std::unique_ptr> evaluator_; + std::unique_ptr evaluator_; std::vector partitioner_; // back pointers to tree and data matrix @@ -354,7 +355,7 @@ class HistBuilder { : monitor_{monitor}, param_{param}, col_sampler_{std::move(column_sampler)}, - evaluator_{std::make_unique>(ctx, param, fmat->Info(), + evaluator_{std::make_unique(ctx, param, fmat->Info(), col_sampler_)}, p_last_fmat_(fmat), histogram_builder_{new HistogramBuilder}, @@ -395,8 +396,7 @@ class HistBuilder { } histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id, collective::IsDistributed(), fmat->Info().IsColumnSplit()); - evaluator_ = std::make_unique>(ctx_, this->param_, fmat->Info(), - col_sampler_); + evaluator_ = std::make_unique(ctx_, this->param_, fmat->Info(), col_sampler_); p_last_tree_ = p_tree; monitor_->Stop(__func__); } @@ -455,8 +455,7 @@ class HistBuilder { for (auto const &grad : gpair_h) { grad_stat.Add(grad.GetGrad(), grad.GetHess()); } - collective::Allreduce(reinterpret_cast(&grad_stat), - 
2); + collective::GlobalSum(p_fmat->Info(), reinterpret_cast(&grad_stat), 2); } auto weight = evaluator_->InitRoot(GradStats{grad_stat}); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 4103b34df..6e94a7510 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -23,6 +23,7 @@ #include "../../src/collective/communicator-inl.h" #include "../../src/common/common.h" +#include "../../src/common/threading_utils.h" #include "../../src/data/array_interface.h" #include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/linalg.h" @@ -388,6 +389,23 @@ inline Context CreateEmptyGenericParam(int gpu_id) { return tparam; } +inline std::unique_ptr> GenerateGradients( + std::size_t rows, bst_target_t n_targets = 1) { + auto p_gradients = std::make_unique>(rows * n_targets); + auto& h_gradients = p_gradients->HostVector(); + + xgboost::SimpleLCG gen; + xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); + + for (std::size_t i = 0; i < rows * n_targets; ++i) { + auto grad = dist(&gen); + auto hess = dist(&gen); + h_gradients[i] = GradientPair{grad, hess}; + } + + return p_gradients; +} + /** * \brief Make a context that uses CUDA. */ @@ -509,11 +527,7 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&& xgboost::collective::Finalize(); }; #if defined(_OPENMP) -#pragma omp parallel num_threads(world_size) - { - auto rank = omp_get_thread_num(); - run(rank); - } + common::ParallelFor(world_size, world_size, run); #else std::vector threads; for (auto rank = 0; rank < world_size; rank++) { diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h index 67a7d70e2..c4d303bb5 100644 --- a/tests/cpp/plugin/helpers.h +++ b/tests/cpp/plugin/helpers.h @@ -13,6 +13,7 @@ #include "../../../plugin/federated/federated_server.h" #include "../../../src/collective/communicator-inl.h" +#include "../../../src/common/threading_utils.h" namespace xgboost { @@ -75,11 +76,7 @@ void RunWithFederatedCommunicator(int32_t world_size, std::string const& server_ xgboost::collective::Finalize(); }; #if defined(_OPENMP) -#pragma omp parallel num_threads(world_size) - { - auto rank = omp_get_thread_num(); - run(rank); - } + common::ParallelFor(world_size, world_size, run); #else std::vector threads; for (auto rank = 0; rank < world_size; rank++) { diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/test_federated_learner.cc index b7066b6a0..ac514d169 100644 --- a/tests/cpp/plugin/test_federated_learner.cc +++ b/tests/cpp/plugin/test_federated_learner.cc @@ -15,9 +15,9 @@ namespace xgboost { namespace { -auto MakeModel(std::string objective, std::shared_ptr dmat) { +auto MakeModel(std::string tree_method, std::string objective, std::shared_ptr dmat) { std::unique_ptr learner{Learner::Create({dmat})}; - learner->SetParam("tree_method", "approx"); + learner->SetParam("tree_method", tree_method); learner->SetParam("objective", objective); if (objective.find("quantile") != std::string::npos) { learner->SetParam("quantile_alpha", "0.5"); @@ -35,7 +35,7 @@ auto MakeModel(std::string objective, std::shared_ptr dmat) { } void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json expected_model, - std::string objective) { + std::string tree_method, std::string objective) { auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); std::shared_ptr dmat{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)}; @@ -61,7 +61,7 @@ void VerifyObjective(size_t rows, size_t cols, float 
expected_base_score, Json e } std::shared_ptr sliced{dmat->SliceCol(world_size, rank)}; - auto model = MakeModel(objective, sliced); + auto model = MakeModel(tree_method, objective, sliced); auto base_score = GetBaseScore(model); ASSERT_EQ(base_score, expected_base_score); ASSERT_EQ(model, expected_model); @@ -76,7 +76,7 @@ class FederatedLearnerTest : public ::testing::TestWithParam { void SetUp() override { server_ = std::make_unique(kWorldSize); } void TearDown() override { server_.reset(nullptr); } - void Run(std::string objective) { + void Run(std::string tree_method, std::string objective) { static auto constexpr kRows{16}; static auto constexpr kCols{16}; @@ -99,17 +99,22 @@ class FederatedLearnerTest : public ::testing::TestWithParam { } } - auto model = MakeModel(objective, dmat); + auto model = MakeModel(tree_method, objective, dmat); auto score = GetBaseScore(model); RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyObjective, kRows, kCols, - score, model, objective); + score, model, tree_method, objective); } }; -TEST_P(FederatedLearnerTest, Objective) { +TEST_P(FederatedLearnerTest, Approx) { std::string objective = GetParam(); - this->Run(objective); + this->Run("approx", objective); +} + +TEST_P(FederatedLearnerTest, Hist) { + std::string objective = GetParam(); + this->Run("hist", objective); } INSTANTIATE_TEST_SUITE_P(FederatedLearnerObjective, FederatedLearnerTest, diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index c53d9d90b..677687255 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -33,7 +33,7 @@ void TestEvaluateSplits(bool force_read_by_column) { auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix(); - auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; + auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; common::HistCollection hist; std::vector row_gpairs = { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f}, @@ -167,7 +167,7 @@ TEST(HistEvaluator, Apply) { param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}}); auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix(); auto sampler = std::make_shared(); - auto evaluator_ = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; + auto evaluator_ = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; CPUExpandEntry entry{0, 0}; entry.split.loss_chg = 10.0f; @@ -195,7 +195,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) { // check the evaluator is returning the optimal split std::vector ft{FeatureType::kCategorical}; auto sampler = std::make_shared(); - HistEvaluator evaluator{&ctx, ¶m_, info_, sampler}; + HistEvaluator evaluator{&ctx, ¶m_, info_, sampler}; evaluator.InitRoot(GradStats{total_gpair_}); RegTree tree; std::vector entries(1); @@ -225,7 +225,7 @@ auto CompareOneHotAndPartition(bool onehot) { RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix(); auto sampler = std::make_shared(); - auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; + auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; std::vector entries(1); for (auto const &gmat : dmat->GetBatches(&ctx, {32, param.sparse_threshold})) { @@ -276,7 +276,7 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { info.num_col_ = 1; info.feature_types = {FeatureType::kCategorical}; Context ctx; - auto evaluator = HistEvaluator{&ctx, ¶m_, info, sampler}; + auto evaluator 
= HistEvaluator{&ctx, ¶m_, info, sampler}; evaluator.InitRoot(GradStats{parent_sum_}); std::vector entries(1); diff --git a/tests/cpp/tree/test_constraints.cc b/tests/cpp/tree/test_constraints.cc index 913dd8712..912d608a3 100644 --- a/tests/cpp/tree/test_constraints.cc +++ b/tests/cpp/tree/test_constraints.cc @@ -79,7 +79,7 @@ TEST(CPUMonoConstraint, Basic) { auto Xy = RandomDataGenerator{kRows, kCols, 0.0}.GenerateDMatrix(true); auto sampler = std::make_shared(); - HistEvaluator evalutor{&ctx, ¶m, Xy->Info(), sampler}; + HistEvaluator evalutor{&ctx, ¶m, Xy->Info(), sampler}; evalutor.InitRoot(GradStats{2.0, 2.0}); SplitEntry split; diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index 45308457c..8ba53e3f1 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -9,28 +9,20 @@ #include "../helpers.h" namespace xgboost::tree { -std::shared_ptr GenerateDMatrix(std::size_t rows, std::size_t cols){ - return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix(); -} - -std::unique_ptr> GenerateGradients(std::size_t rows) { - auto p_gradients = std::make_unique>(rows); - auto& h_gradients = p_gradients->HostVector(); - - xgboost::SimpleLCG gen; - xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); - - for (std::size_t i = 0; i < rows; ++i) { - auto grad = dist(&gen); - auto hess = dist(&gen); - h_gradients[i] = GradientPair{grad, hess}; +std::shared_ptr GenerateDMatrix(std::size_t rows, std::size_t cols, + bool categorical = false) { + if (categorical) { + std::vector ft(cols); + for (size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical; + } + return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix(); + } else { + return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix(); } - - return p_gradients; } -TEST(GrowHistMaker, InteractionConstraint) -{ +TEST(GrowHistMaker, InteractionConstraint) { auto constexpr kRows = 32; auto constexpr kCols = 16; auto p_dmat = GenerateDMatrix(kRows, kCols); @@ -74,8 +66,9 @@ TEST(GrowHistMaker, InteractionConstraint) } namespace { -void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_tree) { - auto p_dmat = GenerateDMatrix(rows, cols); +void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical, + RegTree const& expected_tree) { + auto p_dmat = GenerateDMatrix(rows, cols, categorical); auto p_gradients = GenerateGradients(rows); Context ctx; ObjInfo task{ObjInfo::kRegression}; @@ -90,27 +83,21 @@ void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_t param.Init(Args{}); updater->Update(¶m, p_gradients.get(), sliced.get(), position, {&tree}); - ASSERT_EQ(tree.NumExtraNodes(), 10); - ASSERT_EQ(tree[0].SplitIndex(), 1); - - ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0); - ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0); - - FeatureMap fmap; - auto json = tree.DumpModel(fmap, false, "json"); - auto expected_json = expected_tree.DumpModel(fmap, false, "json"); + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); ASSERT_EQ(json, expected_json); } -} // anonymous namespace -TEST(GrowHistMaker, ColumnSplit) { +void TestColumnSplit(bool categorical) { auto constexpr kRows = 32; auto constexpr kCols = 16; RegTree expected_tree{1u, kCols}; ObjInfo task{ObjInfo::kRegression}; { - auto p_dmat = GenerateDMatrix(kRows, kCols); + auto p_dmat = 
GenerateDMatrix(kRows, kCols, categorical); auto p_gradients = GenerateGradients(kRows); Context ctx; std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; @@ -121,6 +108,12 @@ TEST(GrowHistMaker, ColumnSplit) { } auto constexpr kWorldSize = 2; - RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit, kRows, kCols, std::cref(expected_tree)); + RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, categorical, + std::cref(expected_tree)); } +} // anonymous namespace + +TEST(GrowHistMaker, ColumnSplitNumerical) { TestColumnSplit(false); } + +TEST(GrowHistMaker, ColumnSplitCategorical) { TestColumnSplit(true); } } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 9b9278021..b8d0fd2c0 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -194,11 +194,65 @@ void TestColumnSplitPartitioner(bst_target_t n_targets) { auto constexpr kWorkers = 4; RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner, n_targets, - n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner); + n_samples, n_features, base_rowid, Xy, min_value, mid_value, + mid_partitioner); } } // anonymous namespace TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner(1); } TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner(3); } + +namespace { +void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_targets, + RegTree const& expected_tree) { + auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); + auto p_gradients = GenerateGradients(rows, n_targets); + Context ctx; + ObjInfo task{ObjInfo::kRegression}; + std::unique_ptr updater{TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)}; + std::vector> position(1); + + std::unique_ptr sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())}; + + RegTree tree{n_targets, cols}; + TrainParam param; + param.Init(Args{}); + updater->Update(¶m, p_gradients.get(), sliced.get(), position, {&tree}); + + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); + ASSERT_EQ(json, expected_json); +} + +void TestColumnSplit(bst_target_t n_targets) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + + RegTree expected_tree{n_targets, kCols}; + ObjInfo task{ObjInfo::kRegression}; + { + auto Xy = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); + auto p_gradients = GenerateGradients(kRows, n_targets); + Context ctx; + std::unique_ptr updater{ + TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)}; + std::vector> position(1); + TrainParam param; + param.Init(Args{}); + updater->Update(¶m, p_gradients.get(), Xy.get(), position, {&expected_tree}); + } + + auto constexpr kWorldSize = 2; + RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, n_targets, + std::cref(expected_tree)); +} +} // anonymous namespace + +TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); } + +TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); } + } // namespace xgboost::tree From 614f47c477472118a20838b4880ab1ab51572c3c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 May 2023 18:42:24 +0800 Subject: [PATCH 03/19] Bump flink-clients from 1.17.0 to 1.17.1 in /jvm-packages (#9203) Bumps [flink-clients](https://github.com/apache/flink) from 1.17.0 to 1.17.1. 
- [Commits](https://github.com/apache/flink/compare/release-1.17.0...release-1.17.1) --- updated-dependencies: - dependency-name: org.apache.flink:flink-clients dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 490d5af28..c80bf8bc6 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -33,7 +33,7 @@ UTF-8 1.8 1.8 - 1.17.0 + 1.17.1 3.4.0 3.3.2 2.12.17 From 053aababd480380ab1223ec19d4b9f28d2ce88d7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 27 May 2023 01:36:58 +0800 Subject: [PATCH 04/19] Avoid thrust logical operation. (#9199) Thrust implementation of `thrust::all_of/any_of/none_of` adopts an early stopping strategy to bailout early by dividing the input into small batches. This is not ideal for data validation as we expect all data to be valid. The strategy leads to excessive kernel launches and stream synchronization. * Use reduce from dh instead. --- src/data/device_adapter.cuh | 25 ++++++++++++++++++------- src/data/ellpack_page.cu | 2 +- src/data/simple_dmatrix.cuh | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 494fb7d1c..136fbb743 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -29,7 +29,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { : columns_(columns), num_rows_(num_rows) {} size_t Size() const { return num_rows_ * columns_.size(); } - __device__ COOTuple GetElement(size_t idx) const { + __device__ __forceinline__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % columns_.size(); size_t row_idx = idx / columns_.size(); auto const& column = columns_[column_idx]; @@ -221,13 +221,24 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, * \brief Check there's no inf in data. */ template -bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) { +bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) { auto counting = thrust::make_counting_iterator(0llu); - auto value_iter = dh::MakeTransformIterator( - counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; }); - auto valid = - thrust::none_of(value_iter, value_iter + batch.Size(), - [is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); }); + auto value_iter = dh::MakeTransformIterator(counting, [=] XGBOOST_DEVICE(std::size_t idx) { + auto v = batch.GetElement(idx).value; + if (!is_valid(v)) { + // discard the invalid elements. + return true; + } + // check that there's no inf in data. + return !std::isinf(v); + }); + dh::XGBCachingDeviceAllocator alloc; + // The default implementation in thrust optimizes any_of/none_of/all_of by using small + // intervals to early stop. But we expect all data to be valid here, using small + // intervals only decreases performance due to excessive kernel launch and stream + // synchronization. 
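+  // A logical AND over the whole range: the result is true iff no valid element is infinite.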
+ auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true, + thrust::logical_and<>{}); return valid; } }; // namespace data diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 4409a7ebb..aa218fa31 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -200,7 +200,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span( diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index 63310a929..e2c0ae347 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -64,7 +64,7 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, template size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, SparsePage* page) { - bool valid = HasInfInData(batch, IsValidFunctor{missing}); + bool valid = NoInfInData(batch, IsValidFunctor{missing}); CHECK(valid) << error::InfInData(); page->offset.SetDevice(device); From c5677a2b2c692b60d4199a6814911beecd572385 Mon Sep 17 00:00:00 2001 From: michael-gendy-mention-me <107543707+michael-gendy-mention-me@users.noreply.github.com> Date: Sat, 27 May 2023 00:48:28 +0100 Subject: [PATCH 05/19] Remove `type: ignore` hints (#9197) --- python-package/xgboost/core.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 3a27f5e18..200490d73 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -82,9 +82,10 @@ def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array if isinstance(data, str): return bytes(data, "utf-8") if isinstance(data, list): - pointers: ctypes.Array[ctypes.c_char_p] = (ctypes.c_char_p * len(data))() - data_as_bytes = [bytes(d, "utf-8") for d in data] - pointers[:] = data_as_bytes # type: ignore + data_as_bytes: List[bytes] = [bytes(d, "utf-8") for d in data] + pointers: ctypes.Array[ctypes.c_char_p] = ( + ctypes.c_char_p * len(data_as_bytes) + )(*data_as_bytes) return pointers raise TypeError() @@ -319,7 +320,7 @@ def _cuda_array_interface(data: DataType) -> bytes: def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.ndarray: """Convert a ctypes pointer array to a numpy array.""" ctype: Type[CNumeric] = _numpy2ctypes_type(dtype) - if not isinstance(cptr, ctypes.POINTER(ctype)): # type: ignore + if not isinstance(cptr, ctypes.POINTER(ctype)): raise RuntimeError(f"expected {ctype} pointer") res = np.zeros(length, dtype=dtype) if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): @@ -2460,9 +2461,9 @@ class Booster: raise TypeError("Unknown file type: ", fname) if self.attr("best_iteration") is not None: - self.best_iteration = int(self.attr("best_iteration")) # type: ignore + self.best_iteration = int(cast(int, self.attr("best_iteration"))) if self.attr("best_score") is not None: - self.best_score = float(self.attr("best_score")) # type: ignore + self.best_score = float(cast(float, self.attr("best_score"))) def num_boosted_rounds(self) -> int: """Get number of boosted rounds. For gblinear this is reset to 0 after From 8c174ef2d391a1b2c89a8dcc8549806edbd02ea5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 27 May 2023 17:40:46 +0800 Subject: [PATCH 06/19] [CI] Update images that are not related to binary release. (#9205) * [CI] Update images that are not related to the binary release. - Update clang-tidy, prefer tools from the Ubuntu repository. - Update GPU image to 22.04. 
- Small cleanup to the tidy script. - Remove gpu_jvm, which seems to be unused. --- tests/ci_build/Dockerfile.clang_tidy | 15 ++++---- tests/ci_build/Dockerfile.gpu | 4 +-- tests/ci_build/Dockerfile.gpu_jvm | 53 ---------------------------- tests/ci_build/tidy.py | 31 ++++++++++------ 4 files changed, 29 insertions(+), 74 deletions(-) delete mode 100644 tests/ci_build/Dockerfile.gpu_jvm diff --git a/tests/ci_build/Dockerfile.clang_tidy b/tests/ci_build/Dockerfile.clang_tidy index 967f24d3c..b7cd23042 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/tests/ci_build/Dockerfile.clang_tidy @@ -1,5 +1,5 @@ ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04 +FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04 ARG CUDA_VERSION_ARG # Environment @@ -7,22 +7,21 @@ ENV DEBIAN_FRONTEND noninteractive # Install all basic requirements RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ apt-get update && \ - apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \ + apt-get install -y wget git python3 python3-pip software-properties-common \ apt-transport-https ca-certificates gnupg-agent && \ - wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - add-apt-repository -u 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main' && \ - apt-get update && \ apt-get install -y llvm-15 clang-tidy-15 clang-15 libomp-15-dev && \ - wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr + apt-get install -y cmake # Set default clang-tidy version RUN \ update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 && \ update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 100 +RUN \ + apt-get install libgtest-dev libgmock-dev -y + # Install Python packages RUN \ pip3 install pyyaml diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 3b5701693..4011ad834 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -1,5 +1,5 @@ ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu18.04 +FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG @@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"] # Use Bash as shell # Install all basic requirements RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ apt-get update && \ apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ # Python diff --git a/tests/ci_build/Dockerfile.gpu_jvm b/tests/ci_build/Dockerfile.gpu_jvm deleted file mode 100644 index 7883d189e..000000000 --- a/tests/ci_build/Dockerfile.gpu_jvm +++ /dev/null @@ -1,53 +0,0 @@ -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu16.04 -ARG CUDA_VERSION_ARG -ARG JDK_VERSION=8 -ARG SPARK_VERSION=3.0.0 - -# Environment -ENV DEBIAN_FRONTEND noninteractive - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \ - apt-get update 
&& \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:openjdk-r/ppa && \ - apt-get update && \ - apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/mambaforge && \ - /opt/mambaforge/bin/pip install awscli && \ - # Maven - wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ - tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ - ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ - # Spark - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \ - ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark - -ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH - -# Install Python packages -RUN \ - pip install numpy scipy pandas scikit-learn - -ENV GOSU_VERSION 1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Set default JDK version -RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64 - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/tidy.py b/tests/ci_build/tidy.py index 33e153850..16fb07e0d 100755 --- a/tests/ci_build/tidy.py +++ b/tests/ci_build/tidy.py @@ -41,7 +41,7 @@ class ClangTidy(object): def __init__(self, args): self.cpp_lint = args.cpp self.cuda_lint = args.cuda - self.use_dmlc_gtest = args.use_dmlc_gtest + self.use_dmlc_gtest: bool = args.use_dmlc_gtest self.cuda_archs = args.cuda_archs.copy() if args.cuda_archs else [] if args.tidy_version: @@ -202,6 +202,7 @@ class ClangTidy(object): cdb_file = os.path.join(self.cdb_path, 'compile_commands.json') with open(cdb_file, 'r') as fd: self.compile_commands = json.load(fd) + tidy_file = os.path.join(self.root_path, '.clang-tidy') with open(tidy_file) as fd: self.clang_tidy = yaml.safe_load(fd) @@ -276,16 +277,24 @@ right keywords? 
print('clang-tidy is working.') -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Run clang-tidy.') - parser.add_argument('--cpp', type=int, default=1) - parser.add_argument('--tidy-version', type=int, default=None, - help='Specify the version of preferred clang-tidy.') - parser.add_argument('--cuda', type=int, default=1) - parser.add_argument('--use-dmlc-gtest', type=int, default=1, - help='Whether to use gtest bundled in dmlc-core.') - parser.add_argument('--cuda-archs', action='append', - help='List of CUDA archs to build') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run clang-tidy.") + parser.add_argument("--cpp", type=int, default=1) + parser.add_argument( + "--tidy-version", + type=int, + default=None, + help="Specify the version of preferred clang-tidy.", + ) + parser.add_argument("--cuda", type=int, default=1) + parser.add_argument( + "--use-dmlc-gtest", + action="store_true", + help="Whether to use gtest bundled in dmlc-core.", + ) + parser.add_argument( + "--cuda-archs", action="append", help="List of CUDA archs to build" + ) args = parser.parse_args() test_tidy(args) From a01df102c90b9033e3034461830095693234001b Mon Sep 17 00:00:00 2001 From: Boris Date: Sat, 27 May 2023 13:34:02 +0200 Subject: [PATCH 07/19] Scala 2.13 support. (#9099) 1. Updated the test logic 2. Added smoke tests for Spark examples. 3. Added integration tests for Spark with Scala 2.13 --- .github/workflows/jvm_tests.yml | 10 ++ jvm-packages/.gitignore | 2 + jvm-packages/README.md | 29 ++++- jvm-packages/pom.xml | 25 +++- jvm-packages/xgboost4j-example/pom.xml | 5 +- .../example/flink/DistTrainWithFlink.scala | 13 +- .../example/spark/SparkMLlibPipeline.scala | 18 ++- .../scala/example/spark/SparkTraining.scala | 12 +- .../example/spark/SparkExamplesTest.scala | 123 ++++++++++++++++++ jvm-packages/xgboost4j-flink/pom.xml | 4 +- jvm-packages/xgboost4j-gpu/pom.xml | 9 +- jvm-packages/xgboost4j-spark-gpu/pom.xml | 7 +- jvm-packages/xgboost4j-spark/pom.xml | 7 +- jvm-packages/xgboost4j-tester/generate_pom.py | 71 ++-------- .../ml/dmlc/xgboost4j/tester/AppTest.java | 20 --- jvm-packages/xgboost4j/pom.xml | 9 +- .../ml/dmlc/xgboost4j/scala/EvalTrait.scala | 2 +- .../dmlc/xgboost4j/scala/ObjectiveTrait.scala | 2 +- .../ml/dmlc/xgboost4j/scala/XGBoost.scala | 20 +-- tests/buildkite/build-jvm-packages.sh | 9 +- tests/buildkite/conftest.sh | 2 +- tests/ci_build/Dockerfile.jvm_cross | 12 +- tests/ci_build/build_jvm_packages.sh | 9 +- tests/ci_build/test_jvm_cross.sh | 65 +++++---- 24 files changed, 325 insertions(+), 160 deletions(-) create mode 100644 jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala delete mode 100644 jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index a2d8bb69a..79aac0f0b 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -75,3 +75,13 @@ jobs: if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows env: RABIT_MOCK: ON + + + - name: Build and Test XGBoost4J with scala 2.13 + run: | + rm -rfv build/ + cd jvm-packages + mvn -B clean install test -Pdefault,scala-2.13 + if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + env: + RABIT_MOCK: ON diff --git a/jvm-packages/.gitignore b/jvm-packages/.gitignore index 6d3f7b7cb..e2dc7967a 100644 --- a/jvm-packages/.gitignore +++ 
b/jvm-packages/.gitignore @@ -1,2 +1,4 @@ tracker.py build.sh +xgboost4j-tester/pom.xml +xgboost4j-tester/iris.csv diff --git a/jvm-packages/README.md b/jvm-packages/README.md index c4c8898dd..239464342 100644 --- a/jvm-packages/README.md +++ b/jvm-packages/README.md @@ -36,6 +36,19 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5. latest_version_num ``` +or +``` + + ml.dmlc + xgboost4j_2.13 + latest_version_num + + + ml.dmlc + xgboost4j-spark_2.13 + latest_version_num + +``` sbt ```sbt @@ -47,7 +60,6 @@ libraryDependencies ++= Seq( For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). -To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. ### Access SNAPSHOT version @@ -85,6 +97,19 @@ Then add XGBoost4J as a dependency: latest_version_num-SNAPSHOT ``` +or with scala 2.13 +``` + + ml.dmlc + xgboost4j_2.13 + latest_version_num-SNAPSHOT + + + ml.dmlc + xgboost4j-spark_2.13 + latest_version_num-SNAPSHOT + +``` sbt ```sbt @@ -96,7 +121,9 @@ libraryDependencies ++= Seq( For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html). +### GPU algorithm To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. +Note that scala 2.13 is not supported by the [NVIDIA/spark-rapids#1525](https://github.com/NVIDIA/spark-rapids/issues/1525) yet, so the GPU algorithm can only be used with scala 2.12. ## Examples diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c80bf8bc6..fa0cb5344 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -5,7 +5,7 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT pom XGBoost JVM Package @@ -34,6 +34,7 @@ 1.8 1.8 1.17.1 + 4.13.2 3.4.0 3.3.2 2.12.17 @@ -45,7 +46,9 @@ 23.04.0 23.04.1 cuda11 - + 3.2.16 + 2.9.0 + central_maven @@ -71,6 +74,14 @@ + + scala-2.13 + + 2.13 + 2.13.10 + + + gpu @@ -467,6 +478,7 @@ + com.esotericsoftware kryo @@ -483,6 +495,11 @@ scala-library ${scala.version} + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + ${scala-collection-compat.version} + commons-logging commons-logging @@ -491,13 +508,13 @@ org.scalatest scalatest_${scala.binary.version} - 3.2.16 + ${scalatest.version} test org.scalactic scalactic_${scala.binary.version} - 3.2.15 + ${scalatest.version} test diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index 40c9c72a4..e6ed8a600 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-example_2.12 + xgboost4j-example + xgboost4j-example_${scala.binary.version} 2.0.0-SNAPSHOT jar diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala index cb859f62d..3bfefb841 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala @@ -73,12 +73,13 @@ object DistTrainWithFlink { .map(_.f1.f0) .returns(testDataTypeHint) - val paramMap = mapAsJavaMap(Map( - ("eta", 
"0.1".asInstanceOf[AnyRef]), - ("max_depth", "2"), - ("objective", "binary:logistic"), - ("verbosity", "1") - )) + val paramMap = Map( + ("eta", "0.1".asInstanceOf[AnyRef]), + ("max_depth", "2"), + ("objective", "binary:logistic"), + ("verbosity", "1") + ) + .asJava // number of iterations val round = 2 diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala index 6d676b0ae..b8da31c09 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala @@ -20,10 +20,9 @@ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature._ import org.apache.spark.ml.tuning._ -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.types._ - -import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel} +import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) @@ -50,6 +49,13 @@ object SparkMLlibPipeline { .appName("XGBoost4J-Spark Pipeline Example") .getOrCreate() + run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers) + .show(false) + } + private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String, + pipelineModelPath: String, treeMethod: String, + numWorkers: Int): DataFrame = { + // Load dataset val schema = new StructType(Array( StructField("sepal length", DoubleType, true), @@ -90,11 +96,11 @@ object SparkMLlibPipeline { val labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("realLabel") - .setLabels(labelIndexer.labels) + .setLabels(labelIndexer.labelsArray(0)) val pipeline = new Pipeline() .setStages(Array(assembler, labelIndexer, booster, labelConverter)) - val model = pipeline.fit(training) + val model: PipelineModel = pipeline.fit(training) // Batch prediction val prediction = model.transform(test) @@ -136,6 +142,6 @@ object SparkMLlibPipeline { // Load a saved model and serving val model2 = PipelineModel.load(pipelineModelPath) - model2.transform(test).show(false) + model2.transform(test) } } diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala index 17a32bc09..a7886f524 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala @@ -17,9 +17,8 @@ package ml.dmlc.xgboost4j.scala.example.spark import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier - import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} // this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) @@ -38,6 +37,12 @@ object SparkTraining { val spark = 
SparkSession.builder().getOrCreate() val inputPath = args(0) + val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers) + results.show() + } + +private[spark] def run(spark: SparkSession, inputPath: String, + treeMethod: String, numWorkers: Int): DataFrame = { val schema = new StructType(Array( StructField("sepal length", DoubleType, true), StructField("sepal width", DoubleType, true), @@ -81,7 +86,6 @@ object SparkTraining { setFeaturesCol("features"). setLabelCol("classIndex") val xgbClassificationModel = xgbClassifier.fit(train) - val results = xgbClassificationModel.transform(test) - results.show() + xgbClassificationModel.transform(test) } } diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala new file mode 100644 index 000000000..f6cb700df --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala @@ -0,0 +1,123 @@ +/* + Copyright (c) 2014-2023 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package ml.dmlc.xgboost4j.scala.example.spark + +import org.apache.spark.sql.SparkSession +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.slf4j.LoggerFactory + +import java.io.File +import java.nio.file.{Files, StandardOpenOption} +import scala.jdk.CollectionConverters._ +import scala.util.{Random, Try} + +class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll { + private val logger = LoggerFactory.getLogger(classOf[SparkExamplesTest]) + private val random = new Random(42) + protected val numWorkers: Int = scala.math.min(Runtime.getRuntime.availableProcessors(), 4) + + private val pathToTestDataset = Files.createTempFile("", "iris.csv").toAbsolutePath + private var spark: SparkSession = _ + + override def beforeAll(): Unit = { + + def generateLine(i: Int): String = { + val getIrisName = (int: Int) => { + int % 3 match { + case 0 => "Iris-versicolor" + case 1 => "Iris-virginica" + case 2 => "Iris-setosa" + } + } + val generateValue = () => Math.abs(random.nextInt(99) * 0.1) + val sepalLength = generateValue() + val sepalWidth = generateValue() + val petalLength = generateValue() + val petalWidth = generateValue() + val irisName = getIrisName(Math.abs(random.nextInt()) + i) + s"$sepalLength,$sepalWidth,$petalLength,$petalWidth,$irisName" + } + + if (spark == null) { + spark = SparkSession + .builder() + .appName("XGBoost4J-Spark Pipeline Example") + .master(s"local[${numWorkers}]") + .config("spark.ui.enabled", value = false) + .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) + .config("spark.task.cpus", 1) + .getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + } + val data = (0 until 150) + .map(i => generateLine(i)) + .toList + .asJava + Files.write(pathToTestDataset, + data, + StandardOpenOption.CREATE, + StandardOpenOption.WRITE, + 
StandardOpenOption.TRUNCATE_EXISTING) + logger.info(s"${new String(Files.readAllBytes(pathToTestDataset))}") + + } + + override def afterAll(): Unit = { + if (spark != null) { + spark.stop() + cleanExternalCache(spark.sparkContext.appName) + spark = null + } + + Try(Files.deleteIfExists(pathToTestDataset)) + .recover { + case e => + logger.warn( + s"Could not delete temporary file $pathToTestDataset. Please, remove it manually", + e + ) + true + } + } + + private def cleanExternalCache(prefix: String): Unit = { + val dir = new File(".") + for (file <- dir.listFiles() if file.getName.startsWith(prefix)) { + file.delete() + } + } + + test("Smoke test for SparkMLlibPipeline example") { + SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model", + "target/pipeline-model", "auto", 2) + } + + test("Smoke test for SparkTraining example") { + val spark = SparkSession + .builder() + .appName("XGBoost4J-Spark Pipeline Example") + .master(s"local[${numWorkers}]") + .config("spark.ui.enabled", value = false) + .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) + .config("spark.task.cpus", 1) + .getOrCreate() + + SparkTraining.run(spark, pathToTestDataset.toString, "auto", 2) + } +} diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml index a9a80e29a..8d51a9dcf 100644 --- a/jvm-packages/xgboost4j-flink/pom.xml +++ b/jvm-packages/xgboost4j-flink/pom.xml @@ -5,9 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT + + xgboost4j-flink xgboost4j-flink_${scala.binary.version} 2.0.0-SNAPSHOT diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 1d7a06708..f34680302 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-gpu_2.12 + xgboost4j-gpu_${scala.binary.version} + xgboost4j-gpu 2.0.0-SNAPSHOT jar @@ -35,13 +36,13 @@ junit junit - 4.13.2 + ${junit.version} test org.scalatest scalatest_${scala.binary.version} - 3.2.15 + ${scalatest.version} provided diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index 57770be5a..b7be69e69 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-spark-gpu_2.12 + xgboost4j-spark-gpu + xgboost4j-spark-gpu_${scala.binary.version} @@ -24,7 +25,7 @@ ml.dmlc xgboost4j-gpu_${scala.binary.version} - 2.0.0-SNAPSHOT + ${project.version} org.apache.spark diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index 3a84233d1..d8f4cb914 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j-spark_2.12 + xgboost4j-spark + xgboost4j-spark_${scala.binary.version} @@ -24,7 +25,7 @@ ml.dmlc xgboost4j_${scala.binary.version} - 2.0.0-SNAPSHOT + ${project.version} org.apache.spark diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py index cc391dd00..b9c274c28 100644 --- a/jvm-packages/xgboost4j-tester/generate_pom.py +++ b/jvm-packages/xgboost4j-tester/generate_pom.py @@ -8,25 +8,28 @@ pom_template = """ 4.0.0 ml.dmlc - xgboost4j-tester_2.12 + xgboost4j-tester_{scala_binary_version} 1.0-SNAPSHOT - xgboost4j-tester_2.12 + 
xgboost4j-tester UTF-8 {maven_compiler_source} {maven_compiler_target} + 4.13.2 {spark_version} {scala_version} + 3.2.15 {scala_binary_version} + 5.5.0 - + com.esotericsoftware kryo - 4.0.2 + ${{kryo.version}} org.scala-lang @@ -48,29 +51,12 @@ pom_template = """ commons-logging 1.2 - - com.typesafe.akka - akka-testkit_${{scala.binary.version}} - 2.6.20 - test - org.scalatest scalatest_${{scala.binary.version}} - 3.0.8 + ${{scalatest.version}} test - - org.scalactic - scalactic_${{scala.binary.version}} - 3.2.15 - test - - - org.apache.commons - commons-lang3 - 3.9 - org.apache.spark spark-core_${{scala.binary.version}} @@ -92,7 +78,7 @@ pom_template = """ junit junit - 4.13.2 + ${{junit.version}} test @@ -122,36 +108,9 @@ pom_template = """ - - - maven-clean-plugin - 3.1.0 - - - - maven-resources-plugin - 3.0.2 - - - maven-compiler-plugin - 3.8.0 - - - maven-jar-plugin - 3.0.2 - - - maven-install-plugin - 2.5.2 - - - maven-deploy-plugin - 2.8.2 - org.apache.maven.plugins maven-assembly-plugin - 2.4 jar-with-dependencies @@ -171,22 +130,12 @@ pom_template = """ - - - maven-site-plugin - 3.7.1 - - - maven-project-info-reports-plugin - 3.0.0 - org.apache.maven.plugins maven-surefire-plugin - 2.22.1 - ml.dmlc:xgboost4j_2.12 + ml.dmlc:xgboost4j_${{scala.binary.version}} diff --git a/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java b/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java deleted file mode 100644 index 2df693748..000000000 --- a/jvm-packages/xgboost4j-tester/src/test/java/ml/dmlc/xgboost4j/tester/AppTest.java +++ /dev/null @@ -1,20 +0,0 @@ -package ml.dmlc.xgboost4j.tester; - -import static org.junit.Assert.assertTrue; - -import org.junit.Test; - -/** - * Unit test for simple App. 
- */ -public class AppTest -{ - /** - * Rigorous Test :-) - */ - @Test - public void shouldAnswerWithTrue() - { - assertTrue( true ); - } -} diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 17ed66a12..4352aab12 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -5,10 +5,11 @@ 4.0.0 ml.dmlc - xgboost-jvm_2.12 + xgboost-jvm 2.0.0-SNAPSHOT - xgboost4j_2.12 + xgboost4j + xgboost4j_${scala.binary.version} 2.0.0-SNAPSHOT jar @@ -28,13 +29,13 @@ junit junit - 4.13.2 + ${junit.version} test org.scalatest scalatest_${scala.binary.version} - 3.2.16 + ${scalatest.version} provided diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala index 587ace352..fe17804fd 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/EvalTrait.scala @@ -37,7 +37,7 @@ trait EvalTrait extends IEvaluation { */ def eval(predicts: Array[Array[Float]], dmat: DMatrix): Float - private[scala] def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = { + def eval(predicts: Array[Array[Float]], jdmat: java.DMatrix): Float = { require(predicts.length == jdmat.getLabel.length, "predicts size and label size must match " + s" predicts size: ${predicts.length}, label size: ${jdmat.getLabel.length}") eval(predicts, new DMatrix(jdmat)) diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala index 24e603762..de218f0c5 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/ObjectiveTrait.scala @@ -31,7 +31,7 @@ trait ObjectiveTrait extends IObjective { */ def getGradient(predicts: Array[Array[Float]], dtrain: DMatrix): List[Array[Float]] - private[scala] def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix): + def getGradient(predicts: Array[Array[Float]], dtrain: JDMatrix): java.util.List[Array[Float]] = { getGradient(predicts, new DMatrix(dtrain)).asJava } diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala index 90d06c343..50d86c893 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/XGBoost.scala @@ -17,12 +17,11 @@ package ml.dmlc.xgboost4j.scala import java.io.InputStream +import ml.dmlc.xgboost4j.java.{XGBoostError, XGBoost => JXGBoost} -import ml.dmlc.xgboost4j.java.{XGBoostError, Booster => JBooster, XGBoost => JXGBoost} -import scala.collection.JavaConverters._ - +import scala.jdk.CollectionConverters._ import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.fs.Path /** * XGBoost Scala Training function. 
@@ -40,7 +39,12 @@ object XGBoost { earlyStoppingRound: Int = 0, prevBooster: Booster, checkpointParams: Option[ExternalCheckpointParams]): Booster = { - val jWatches = watches.mapValues(_.jDMatrix).asJava + + // we have to filter null value for customized obj and eval + val jParams: java.util.Map[String, AnyRef] = + params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).toMap.asJava + + val jWatches = watches.mapValues(_.jDMatrix).toMap.asJava val jBooster = if (prevBooster == null) { null } else { @@ -51,8 +55,7 @@ object XGBoost { map(cp => { JXGBoost.trainAndSaveCheckpoint( dtrain.jDMatrix, - // we have to filter null value for customized obj and eval - params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava, + jParams, numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster, cp.checkpointInterval, cp.checkpointPath, @@ -61,8 +64,7 @@ object XGBoost { getOrElse( JXGBoost.train( dtrain.jDMatrix, - // we have to filter null value for customized obj and eval - params.filter(_._2 != null).mapValues(_.toString.asInstanceOf[AnyRef]).asJava, + jParams, numRounds, jWatches, metrics, obj, eval, earlyStoppingRound, jBooster) ) if (prevBooster == null) { diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh index 1de43bbd0..33cfffe71 100755 --- a/tests/buildkite/build-jvm-packages.sh +++ b/tests/buildkite/build-jvm-packages.sh @@ -4,11 +4,18 @@ set -euo pipefail source tests/buildkite/conftest.sh -echo "--- Build XGBoost JVM packages" +echo "--- Build XGBoost JVM packages scala 2.12" tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ ${SPARK_VERSION} + +echo "--- Build XGBoost JVM packages scala 2.13" + +tests/ci_build/ci_build.sh jvm docker tests/ci_build/build_jvm_packages.sh \ + ${SPARK_VERSION} "" "" "true" + echo "--- Stash XGBoost4J JARs" buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" +buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index cf9270c11..957dd443c 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -25,7 +25,7 @@ set -x CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 RAPIDS_VERSION=23.02 -SPARK_VERSION=3.1.1 +SPARK_VERSION=3.4.0 JDK_VERSION=8 if [[ -z ${BUILDKITE:-} ]] diff --git a/tests/ci_build/Dockerfile.jvm_cross b/tests/ci_build/Dockerfile.jvm_cross index 6d9c5c57f..fdfae310a 100644 --- a/tests/ci_build/Dockerfile.jvm_cross +++ b/tests/ci_build/Dockerfile.jvm_cross @@ -20,10 +20,14 @@ RUN \ wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ ln -s /opt/apache-maven-3.6.1/ /opt/maven && \ - # Spark - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop2.7.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop2.7.tgz -C /opt && \ - ln -s /opt/spark-$SPARK_VERSION-bin-hadoop2.7 /opt/spark + # Spark with scala 2.12 + mkdir -p /opt/spark-scala-2.12 && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \ + tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ + # Spark with scala 2.13 + mkdir -p /opt/spark-scala-2.13 && \ + wget 
-nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \ + tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 ENV PATH=/opt/mambaforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh index 241fc445f..5797a1f61 100755 --- a/tests/ci_build/build_jvm_packages.sh +++ b/tests/ci_build/build_jvm_packages.sh @@ -6,6 +6,7 @@ set -x spark_version=$1 use_cuda=$2 gpu_arch=$3 +use_scala213=$4 gpu_options="" if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then @@ -22,7 +23,13 @@ export RABIT_MOCK=ON if [ "x$gpu_arch" != "x" ]; then export GPU_ARCH_FLAG=$gpu_arch fi -mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options + +mvn_profile_string="" +if [ "x$use_scala213" != "x" ]; then + export mvn_profile_string="-Pdefault,scala-2.13" +fi + +mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options set +x set +e diff --git a/tests/ci_build/test_jvm_cross.sh b/tests/ci_build/test_jvm_cross.sh index 378846d65..18265cf01 100755 --- a/tests/ci_build/test_jvm_cross.sh +++ b/tests/ci_build/test_jvm_cross.sh @@ -6,37 +6,56 @@ set -x # Initialize local Maven repository ./tests/ci_build/initialize_maven.sh -# Get version number of XGBoost4J and other auxiliary information cd jvm-packages +jvm_packages_dir=`pwd` +# Get version number of XGBoost4J and other auxiliary information xgboost4j_version=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) maven_compiler_source=$(mvn help:evaluate -Dexpression=maven.compiler.source -q -DforceStdout) maven_compiler_target=$(mvn help:evaluate -Dexpression=maven.compiler.target -q -DforceStdout) spark_version=$(mvn help:evaluate -Dexpression=spark.version -q -DforceStdout) -scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout) -scala_binary_version=$(mvn help:evaluate -Dexpression=scala.binary.version -q -DforceStdout) -# Install XGBoost4J JAR into local Maven repository -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar -mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar - -cd xgboost4j-tester -# Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests -python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version} -# Run unit tests with XGBoost4J -mvn --no-transfer-progress package - -# Run integration tests with XGBoost4J -java -jar 
./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar - -# Run integration tests with XGBoost4J-Spark -if [ ! -z "$RUN_INTEGRATION_TEST" ] -then +if [ ! -z "$RUN_INTEGRATION_TEST" ]; then + cd $jvm_packages_dir/xgboost4j-tester python3 get_iris.py - spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv - spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ./target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar ${PWD}/iris.csv ${PWD}/native_model ${PWD}/pipeline_model + cd $jvm_packages_dir fi +# including maven profiles for different scala versions: 2.12 is the default at the moment. +for _maven_profile_string in "" "-Pdefault,scala-2.13"; do + scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout) + scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout) + + # Install XGBoost4J JAR into local Maven repository + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}-tests.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=test-jar -Dclassifier=tests + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-spark/target/xgboost4j-spark_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-spark_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar + mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j-example/target/xgboost4j-example_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j-example_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar + + cd xgboost4j-tester + # Generate pom.xml for XGBoost4J-tester, a dummy project to run XGBoost4J tests + python3 ./generate_pom.py ${xgboost4j_version} ${maven_compiler_source} ${maven_compiler_target} ${spark_version} ${scala_version} ${scala_binary_version} + # Build package and unit tests with XGBoost4J + mvn --no-transfer-progress clean package + xgboost4j_tester_jar="$jvm_packages_dir/xgboost4j-tester/target/xgboost4j-tester_${scala_binary_version}-1.0-SNAPSHOT-jar-with-dependencies.jar" + # Run integration tests with XGBoost4J + java -jar $xgboost4j_tester_jar + + # Run integration tests with XGBoost4J-Spark + if [ ! 
-z "$RUN_INTEGRATION_TEST" ]; then + # Changing directory so that we do not mix code and resulting files + cd target + if [[ "$scala_binary_version" == "2.12" ]]; then + /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv + /opt/spark-scala-2.12/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version} + elif [[ "$scala_binary_version" == "2.13" ]]; then + /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkTraining --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv + /opt/spark-scala-2.13/bin/spark-submit --class ml.dmlc.xgboost4j.scala.example.spark.SparkMLlibPipeline --master 'local[8]' ${xgboost4j_tester_jar} $jvm_packages_dir/xgboost4j-tester/iris.csv ${PWD}/native_model-${scala_version} ${PWD}/pipeline_model-${scala_version} + else + echo "Unexpected scala version: $scala_version ($scala_binary_version)." + fi + fi + cd $jvm_packages_dir +done + set +x set +e From d563d6d8f422c78d53be40a83b91a6d6b325443e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 28 May 2023 00:22:28 +0800 Subject: [PATCH 08/19] Bump scala-collection-compat_2.12 from 2.9.0 to 2.10.0 in /jvm-packages (#9208) Bumps [scala-collection-compat_2.12](https://github.com/scala/scala-collection-compat) from 2.9.0 to 2.10.0. - [Release notes](https://github.com/scala/scala-collection-compat/releases) - [Commits](https://github.com/scala/scala-collection-compat/compare/v2.9.0...v2.10.0) --- updated-dependencies: - dependency-name: org.scala-lang.modules:scala-collection-compat_2.12 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index fa0cb5344..a950f27d9 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -47,7 +47,7 @@ 23.04.1 cuda11 3.2.16 - 2.9.0 + 2.10.0 From 03bc6e64270d19ffd634aa3009b11d3be42159d4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 28 May 2023 05:24:15 +0800 Subject: [PATCH 09/19] Remove unused variables. (#9210) - remove used variables. - Remove signed comparison warnings. --- src/collective/communicator-inl.h | 2 +- src/common/partition_builder.h | 3 +-- src/tree/common_row_partitioner.h | 2 +- src/tree/hist/evaluate_splits.h | 4 ++-- tests/cpp/tree/test_quantile_hist.cc | 1 - 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h index 8ae96dd8a..59cc4cc45 100644 --- a/src/collective/communicator-inl.h +++ b/src/collective/communicator-inl.h @@ -252,7 +252,7 @@ inline AllgatherVResult AllgatherV(std::vector const &inputs, // Calculate input offsets (std::exclusive_scan). 
std::vector offsets(all_sizes.size()); - for (auto i = 1; i < offsets.size(); i++) { + for (std::size_t i = 1; i < offsets.size(); i++) { offsets[i] = offsets[i - 1] + all_sizes[i - 1]; } diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index cfa2f6a58..53e11bd91 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -263,8 +263,7 @@ class PartitionBuilder { template void PartitionByMask(const size_t node_in_set, std::vector const& nodes, const common::Range1d range, GHistIndexMatrix const& gmat, - const common::ColumnMatrix& column_matrix, const RegTree& tree, - const size_t* rid, BitVector const& decision_bits, + const RegTree& tree, const size_t* rid, BitVector const& decision_bits, BitVector const& missing_bits) { common::Span rid_span(rid + range.begin(), rid + range.end()); common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index ef12d0ccc..4360c0b13 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -67,7 +67,7 @@ class ColumnSplitHelper { const int32_t nid = nodes[node_in_set].nid; const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin); partition_builder_->AllocateForTask(task_id); - partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree, + partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree, (*row_set_collection_)[nid].begin, decision_bits_, missing_bits_); }); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index c8321b48a..4fb857e06 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -298,7 +298,7 @@ class HistEvaluator { std::vector all_entries(num_entries * world); std::vector cat_bits; std::vector cat_bits_sizes; - for (auto i = 0; i < num_entries; i++) { + for (std::size_t i = 0; i < num_entries; i++) { all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes); } collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry)); @@ -586,7 +586,7 @@ class HistMultiEvaluator { std::vector cat_bits; std::vector cat_bits_sizes; std::vector gradients; - for (auto i = 0; i < num_entries; i++) { + for (std::size_t i = 0; i < num_entries; i++) { all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients); } diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index b8d0fd2c0..7b99a5bf2 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -113,7 +113,6 @@ void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples, for (auto const& page : Xy->GetBatches()) { GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads()); - bst_feature_t const split_ind = 0; common::ColumnMatrix column_indices; column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads()); { From 320323f533cecd463cc6e236f776ea958e25b94d Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 29 May 2023 05:58:16 +0800 Subject: [PATCH 10/19] [pyspark] add parameters in the ctor of all estimators. 
(#9202) --------- Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/core.py | 20 +- python-package/xgboost/spark/estimator.py | 261 ++++++++++++++++------ 2 files changed, 204 insertions(+), 77 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index d2eff943c..0181e678d 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -337,11 +337,9 @@ class _SparkXGBParams( if self.getOrDefault(self.features_cols): if not self.getOrDefault(self.use_gpu): - raise ValueError("features_cols param requires enabling use_gpu.") - - get_logger(self.__class__.__name__).warning( - "If features_cols param set, then features_col param is ignored." - ) + raise ValueError( + "features_col param with list value requires enabling use_gpu." + ) if self.getOrDefault("objective") is not None: if not isinstance(self.getOrDefault("objective"), str): @@ -547,6 +545,8 @@ FeatureProp = namedtuple( class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): + _input_kwargs: Dict[str, Any] + def __init__(self) -> None: super().__init__() self._set_xgb_params_default() @@ -576,6 +576,11 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): raise ValueError("Invalid param name: 'arbitrary_params_dict'.") for k, v in kwargs.items(): + # We're not allowing user use features_cols directly. + if k == self.features_cols.name: + raise ValueError( + f"Unsupported param '{k}' please use features_col instead." + ) if k in _inverse_pyspark_param_alias_map: raise ValueError( f"Please use param name {_inverse_pyspark_param_alias_map[k]} instead." @@ -591,7 +596,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): k = real_k if self.hasParam(k): - self._set(**{str(k): v}) + if k == "features_col" and isinstance(v, list): + self._set(**{"features_cols": v}) + else: + self._set(**{str(k): v}) else: if ( k in _unsupported_xgb_params diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 67eea4ad2..5054ef0dd 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -1,10 +1,13 @@ """Xgboost pyspark integration submodule for estimator API.""" # pylint: disable=too-many-ancestors # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name +# pylint: disable=unused-argument, too-many-locals -from typing import Any, Type + +from typing import Any, Dict, List, Optional, Type, Union import numpy as np +from pyspark import keyword_only from pyspark.ml.param import Param, Params from pyspark.ml.param.shared import HasProbabilityCol, HasRawPredictionCol @@ -83,8 +86,8 @@ class SparkXGBRegressor(_SparkXGBEstimator): :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBRegressor automatically supports most of the parameters in - `xgboost.XGBRegressor` constructor and most of the parameters used in - :py:class:`xgboost.XGBRegressor` fit and predict method. + :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in + :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method. SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`, see doc below for more details. 
@@ -97,13 +100,23 @@ class SparkXGBRegressor(_SparkXGBEstimator): SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread` param for each xgboost worker will be set equal to `spark.task.cpus` config value. - callbacks: - The export and import of the callback functions are at best effort. - For details, see :py:attr:`xgboost.spark.SparkXGBRegressor.callbacks` param doc. - validation_indicator_col - For params related to `xgboost.XGBRegressor` training - with evaluation dataset's supervision, set - :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col` + + Parameters + ---------- + + features_col: + When the value is string, it requires the features column name to be vector type. + When the value is a list of string, it requires all the feature columns to be numeric types. + label_col: + Label column name. Default to "label". + prediction_col: + Prediction column name. Default to "prediction" + pred_contrib_col: + Contribution prediction column name. + validation_indicator_col: + For params related to `xgboost.XGBRegressor` training with + evaluation dataset's supervision, + set :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col` parameter instead of setting the `eval_set` parameter in `xgboost.XGBRegressor` fit method. weight_col: @@ -111,26 +124,40 @@ class SparkXGBRegressor(_SparkXGBEstimator): :py:attr:`xgboost.spark.SparkXGBRegressor.weight_col` parameter instead of setting `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRegressor` fit method. - xgb_model: - Set the value to be the instance returned by - :func:`xgboost.spark.SparkXGBRegressorModel.get_booster`. - num_workers: - Integer that specifies the number of XGBoost workers to use. - Each XGBoost worker corresponds to one spark task. - use_gpu: - Boolean that specifies whether the executors are running on GPU - instances. base_margin_col: To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed - training. + `xgboost.XGBRegressor` fit method. - .. Note:: The Parameters chart above contains parameters that need special handling. - For a full list of parameters, see entries with `Param(parent=...` below. + num_workers: + How many XGBoost workers to be used to train. + Each XGBoost worker corresponds to one spark task. + use_gpu: + Boolean value to specify whether the executors are running on GPU + instances. + force_repartition: + Boolean value to specify if forcing the input dataset to be repartitioned + before XGBoost training. + repartition_random_shuffle: + Boolean value to specify if randomly shuffling the dataset when repartitioning is required. + enable_sparse_data_optim: + Boolean value to specify if enabling sparse data optimization, if True, + Xgboost DMatrix object will be constructed from sparse matrix instead of + dense matrix. + + kwargs: + A dictionary of xgboost parameters, please refer to + https://xgboost.readthedocs.io/en/stable/parameter.html + + Note + ---- + + The Parameters chart above contains parameters that need special handling. + For a full list of parameters, see entries with `Param(parent=...` below. + + This API is experimental. - .. Note:: This API is experimental. 
Examples -------- @@ -155,9 +182,27 @@ class SparkXGBRegressor(_SparkXGBEstimator): """ - def __init__(self, **kwargs: Any) -> None: + @keyword_only + def __init__( + self, + *, + features_col: Union[str, List[str]] = "features", + label_col: str = "label", + prediction_col: str = "prediction", + pred_contrib_col: Optional[str] = None, + validation_indicator_col: Optional[str] = None, + weight_col: Optional[str] = None, + base_margin_col: Optional[str] = None, + num_workers: int = 1, + use_gpu: bool = False, + force_repartition: bool = False, + repartition_random_shuffle: bool = False, + enable_sparse_data_optim: bool = False, + **kwargs: Dict[str, Any], + ) -> None: super().__init__() - self.setParams(**kwargs) + input_kwargs = self._input_kwargs + self.setParams(**input_kwargs) @classmethod def _xgb_cls(cls) -> Type[XGBRegressor]: @@ -199,8 +244,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBClassifier automatically supports most of the parameters in - `xgboost.XGBClassifier` constructor and most of the parameters used in - :py:class:`xgboost.XGBClassifier` fit and predict method. + :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in + :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method. SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`, see doc below for more details. @@ -220,13 +265,21 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction Parameters ---------- - callbacks: - The export and import of the callback functions are at best effort. For - details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc. + features_col: + When the value is string, it requires the features column name to be vector type. + When the value is a list of string, it requires all the feature columns to be numeric types. + label_col: + Label column name. Default to "label". + prediction_col: + Prediction column name. Default to "prediction" + probability_col: + Column name for predicted class conditional probabilities. Default to probabilityCol raw_prediction_col: The `output_margin=True` is implicitly supported by the `rawPredictionCol` output column, which is always returned with the predicted margin values. + pred_contrib_col: + Contribution prediction column name. validation_indicator_col: For params related to `xgboost.XGBClassifier` training with evaluation dataset's supervision, @@ -238,26 +291,39 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` fit method. - xgb_model: - Set the value to be the instance returned by - :func:`xgboost.spark.SparkXGBClassifierModel.get_booster`. - num_workers: - Integer that specifies the number of XGBoost workers to use. - Each XGBoost worker corresponds to one spark task. - use_gpu: - Boolean that specifies whether the executors are running on GPU - instances. base_margin_col: To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed - training. + `xgboost.XGBClassifier` fit method. - .. 
Note:: The Parameters chart above contains parameters that need special handling. - For a full list of parameters, see entries with `Param(parent=...` below. + num_workers: + How many XGBoost workers to be used to train. + Each XGBoost worker corresponds to one spark task. + use_gpu: + Boolean value to specify whether the executors are running on GPU + instances. + force_repartition: + Boolean value to specify if forcing the input dataset to be repartitioned + before XGBoost training. + repartition_random_shuffle: + Boolean value to specify if randomly shuffling the dataset when repartitioning is required. + enable_sparse_data_optim: + Boolean value to specify if enabling sparse data optimization, if True, + Xgboost DMatrix object will be constructed from sparse matrix instead of + dense matrix. - .. Note:: This API is experimental. + kwargs: + A dictionary of xgboost parameters, please refer to + https://xgboost.readthedocs.io/en/stable/parameter.html + + Note + ---- + + The Parameters chart above contains parameters that need special handling. + For a full list of parameters, see entries with `Param(parent=...` below. + + This API is experimental. Examples -------- @@ -281,14 +347,34 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction """ - def __init__(self, **kwargs: Any) -> None: + @keyword_only + def __init__( + self, + *, + features_col: Union[str, List[str]] = "features", + label_col: str = "label", + prediction_col: str = "prediction", + probability_col: str = "probability", + raw_prediction_col: str = "rawPrediction", + pred_contrib_col: Optional[str] = None, + validation_indicator_col: Optional[str] = None, + weight_col: Optional[str] = None, + base_margin_col: Optional[str] = None, + num_workers: int = 1, + use_gpu: bool = False, + force_repartition: bool = False, + repartition_random_shuffle: bool = False, + enable_sparse_data_optim: bool = False, + **kwargs: Dict[str, Any], + ) -> None: super().__init__() # The default 'objective' param value comes from sklearn `XGBClassifier` ctor, # but in pyspark we will automatically set objective param depending on # binary or multinomial input dataset, and we need to remove the fixed default # param value as well to avoid causing ambiguity. + input_kwargs = self._input_kwargs + self.setParams(**input_kwargs) self._setDefault(objective=None) - self.setParams(**kwargs) @classmethod def _xgb_cls(cls) -> Type[XGBClassifier]: @@ -334,8 +420,8 @@ class SparkXGBRanker(_SparkXGBEstimator): :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBRanker automatically supports most of the parameters in - `xgboost.XGBRanker` constructor and most of the parameters used in - :py:class:`xgboost.XGBRanker` fit and predict method. + :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in + :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method. SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`, see doc below for more details. @@ -355,39 +441,53 @@ class SparkXGBRanker(_SparkXGBEstimator): Parameters ---------- - callbacks: - The export and import of the callback functions are at best effort. For - details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc. + features_col: + When the value is string, it requires the features column name to be vector type. + When the value is a list of string, it requires all the feature columns to be numeric types. + label_col: + Label column name. Default to "label". 
+ prediction_col: + Prediction column name. Default to "prediction" + pred_contrib_col: + Contribution prediction column name. validation_indicator_col: For params related to `xgboost.XGBRanker` training with evaluation dataset's supervision, - set :py:attr:`xgboost.spark.XGBRanker.validation_indicator_col` - parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker` + set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col` + parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBRanker` fit method. weight_col: To specify the weight of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting - `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker` + `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBRanker` fit method. - xgb_model: - Set the value to be the instance returned by - :func:`xgboost.spark.SparkXGBRankerModel.get_booster`. - num_workers: - Integer that specifies the number of XGBoost workers to use. - Each XGBoost worker corresponds to one spark task. - use_gpu: - Boolean that specifies whether the executors are running on GPU - instances. base_margin_col: To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBRanker` fit method. + :py:class:`xgboost.XGBRanker` fit method. qid_col: - To specify the qid of the training and validation - dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.qid_col` parameter - instead of setting `qid` / `group`, `eval_qid` / `eval_group` in the - `xgboost.XGBRanker` fit method. + Query id column name. + + num_workers: + How many XGBoost workers to be used to train. + Each XGBoost worker corresponds to one spark task. + use_gpu: + Boolean value to specify whether the executors are running on GPU + instances. + force_repartition: + Boolean value to specify if forcing the input dataset to be repartitioned + before XGBoost training. + repartition_random_shuffle: + Boolean value to specify if randomly shuffling the dataset when repartitioning is required. + enable_sparse_data_optim: + Boolean value to specify if enabling sparse data optimization, if True, + Xgboost DMatrix object will be constructed from sparse matrix instead of + dense matrix. + + kwargs: + A dictionary of xgboost parameters, please refer to + https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. 
@@ -426,9 +526,28 @@ class SparkXGBRanker(_SparkXGBEstimator): >>> model.transform(df_test).show() """ - def __init__(self, **kwargs: Any) -> None: + @keyword_only + def __init__( + self, + *, + features_col: Union[str, List[str]] = "features", + label_col: str = "label", + prediction_col: str = "prediction", + pred_contrib_col: Optional[str] = None, + validation_indicator_col: Optional[str] = None, + weight_col: Optional[str] = None, + base_margin_col: Optional[str] = None, + qid_col: Optional[str] = None, + num_workers: int = 1, + use_gpu: bool = False, + force_repartition: bool = False, + repartition_random_shuffle: bool = False, + enable_sparse_data_optim: bool = False, + **kwargs: Dict[str, Any], + ) -> None: super().__init__() - self.setParams(**kwargs) + input_kwargs = self._input_kwargs + self.setParams(**input_kwargs) @classmethod def _xgb_cls(cls) -> Type[XGBRanker]: From ddec0f378ceb64d91521b687850e772d1d92a7fa Mon Sep 17 00:00:00 2001 From: Jean Lescut-Muller <128988532+JeanLescutMuller@users.noreply.github.com> Date: Mon, 29 May 2023 22:07:12 +0200 Subject: [PATCH 11/19] [doc] Show derivative of the custom objective (#9213) --------- Co-authored-by: Jiaming Yuan --- doc/tutorials/custom_metric_obj.rst | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst index 8a2fcb718..c6d5fbff5 100644 --- a/doc/tutorials/custom_metric_obj.rst +++ b/doc/tutorials/custom_metric_obj.rst @@ -27,20 +27,29 @@ In the following two sections, we will provide a step by step walk through of im the ``Squared Log Error (SLE)`` objective function: .. math:: - \frac{1}{2}[log(pred + 1) - log(label + 1)]^2 + \frac{1}{2}[\log(pred + 1) - \log(label + 1)]^2 and its default metric ``Root Mean Squared Log Error(RMSLE)``: .. math:: - \sqrt{\frac{1}{N}[log(pred + 1) - log(label + 1)]^2} + \sqrt{\frac{1}{N}[\log(pred + 1) - \log(label + 1)]^2} Although XGBoost has native support for said functions, using it for demonstration provides us the opportunity of comparing the result from our own implementation and the one from XGBoost internal for learning purposes. After finishing this tutorial, we should be able to provide our own functions for rapid experiments. And at the end, we will provide some notes on non-identy link function along with examples of using custom metric -and objective with `scikit-learn` interface. -with scikit-learn interface. +and objective with the `scikit-learn` interface. + +If we compute the gradient of said objective function: + +.. math:: + g = \frac{\partial{objective}}{\partial{pred}} = \frac{\log(pred + 1) - \log(label + 1)}{pred + 1} + +As well as the hessian (the second derivative of the objective): + +.. math:: + h = \frac{\partial^2{objective}}{\partial{pred}} = \frac{ - \log(pred + 1) + \log(label + 1) + 1}{(pred + 1)^2} ***************************** Customized Objective Function From ae7450ce543aafead07fa8953de40f4cf34d262d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 30 May 2023 17:23:09 +0800 Subject: [PATCH 12/19] Skip optional synchronization in thrust. 
(#9212) --- src/common/cuda_context.cuh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/common/cuda_context.cuh b/src/common/cuda_context.cuh index 9056c1b5e..c8b2e0792 100644 --- a/src/common/cuda_context.cuh +++ b/src/common/cuda_context.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2023, XGBoost Contributors */ #ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_ #define XGBOOST_COMMON_CUDA_CONTEXT_CUH_ @@ -17,11 +17,23 @@ struct CUDAContext { /** * \brief Caching thrust policy. */ - auto CTP() const { return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); } + auto CTP() const { +#if THRUST_MAJOR_VERSION >= 2 + return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream()); +#else + return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); +#endif // THRUST_MAJOR_VERSION >= 2 + } /** * \brief Thrust policy without caching allocator. */ - auto TP() const { return thrust::cuda::par(alloc_).on(dh::DefaultStream()); } + auto TP() const { +#if THRUST_MAJOR_VERSION >= 2 + return thrust::cuda::par_nosync(alloc_).on(dh::DefaultStream()); +#else + return thrust::cuda::par(alloc_).on(dh::DefaultStream()); +#endif // THRUST_MAJOR_VERSION >= 2 + } auto Stream() const { return dh::DefaultStream(); } }; } // namespace xgboost From 6f83d9c69a2ee42281007ccae95cfd30e5da78c7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 May 2023 19:10:07 +0800 Subject: [PATCH 13/19] Bump maven-project-info-reports-plugin in /jvm-packages (#9219) Bumps [maven-project-info-reports-plugin](https://github.com/apache/maven-project-info-reports-plugin) from 3.4.3 to 3.4.4. - [Commits](https://github.com/apache/maven-project-info-reports-plugin/compare/maven-project-info-reports-plugin-3.4.3...maven-project-info-reports-plugin-3.4.4) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-project-info-reports-plugin dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index a950f27d9..142137654 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -462,7 +462,7 @@ maven-project-info-reports-plugin - 3.4.3 + 3.4.4 net.alchim31.maven From 097f11b6e0003f700857bc5f225526694ae79752 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 30 May 2023 20:54:31 +0800 Subject: [PATCH 14/19] Support CUDA f16 without transformation. (#9207) - Support f16 from cupy. - Include CUDA header explicitly. - Cleanup cmake nvtx support. 
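With this change the Python GPU adapter no longer casts cupy `float16` inputs to `float32`; arrays exposing `__cuda_array_interface__` with typestr `<f2` are consumed directly as `__half` on CUDA builds. A minimal usage sketch, not part of this patch, assuming a CUDA-enabled XGBoost build with cupy installed:

```python
# Illustrative only: pass a float16 cupy array straight to XGBoost on GPU.
import cupy as cp
import xgboost as xgb

X = cp.random.rand(1000, 10).astype(cp.float16)   # kept as f16, no float32 copy
y = cp.random.rand(1000).astype(cp.float32)

Xy = xgb.QuantileDMatrix(X, y)                    # consumed via __cuda_array_interface__
booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)
```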
--- CMakeLists.txt | 2 ++ cmake/Utils.cmake | 16 ++++++---------- cmake/modules/FindNVTX.cmake | 26 ------------------------- python-package/xgboost/data.py | 2 +- src/data/array_interface.h | 35 +++++++++++++++++----------------- 5 files changed, 27 insertions(+), 54 deletions(-) delete mode 100644 cmake/modules/FindNVTX.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 7953a10dd..ede6c5b75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,8 @@ if (USE_CUDA) set(GEN_CODE "") format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE) add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap) + + find_package(CUDAToolkit REQUIRED) endif (USE_CUDA) if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 57a45ca42..dc523d03a 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -124,13 +124,6 @@ function(format_gencode_flags flags out) endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") endfunction(format_gencode_flags flags) -macro(enable_nvtx target) - find_package(NVTX REQUIRED) - target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}") - target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}") - target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1) -endmacro() - # Set CUDA related flags to target. Must be used after code `format_gencode_flags`. function(xgboost_set_cuda_flags target) target_compile_options(${target} PRIVATE @@ -162,11 +155,14 @@ function(xgboost_set_cuda_flags target) endif (USE_DEVICE_DEBUG) if (USE_NVTX) - enable_nvtx(${target}) + target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1) endif (USE_NVTX) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1) - target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/gputreeshap) + target_include_directories( + ${target} PRIVATE + ${xgboost_SOURCE_DIR}/gputreeshap + ${CUDAToolkit_INCLUDE_DIRS}) if (MSVC) target_compile_options(${target} PRIVATE @@ -289,7 +285,7 @@ macro(xgboost_target_link_libraries target) endif (USE_NCCL) if (USE_NVTX) - enable_nvtx(${target}) + target_link_libraries(${target} PRIVATE CUDA::nvToolsExt) endif (USE_NVTX) if (RABIT_BUILD_MPI) diff --git a/cmake/modules/FindNVTX.cmake b/cmake/modules/FindNVTX.cmake deleted file mode 100644 index 173e255c8..000000000 --- a/cmake/modules/FindNVTX.cmake +++ /dev/null @@ -1,26 +0,0 @@ -if (NVTX_LIBRARY) - unset(NVTX_LIBRARY CACHE) -endif (NVTX_LIBRARY) - -set(NVTX_LIB_NAME nvToolsExt) - - -find_path(NVTX_INCLUDE_DIR - NAMES nvToolsExt.h - PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include) - - -find_library(NVTX_LIBRARY - NAMES nvToolsExt - PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64) - -message(STATUS "Using nvtx library: ${NVTX_LIBRARY}") - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NVTX DEFAULT_MSG - NVTX_INCLUDE_DIR NVTX_LIBRARY) - -mark_as_advanced( - NVTX_INCLUDE_DIR - NVTX_LIBRARY -) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 2ebde84f0..5e1a309e0 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -882,7 +882,7 @@ def _transform_cupy_array(data: DataType) -> CupyT: if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"): data = cupy.array(data, copy=False) - if data.dtype.hasobject or data.dtype in [cupy.float16, cupy.bool_]: + if data.dtype.hasobject or data.dtype in [cupy.bool_]: data = data.astype(cupy.float32, copy=False) return data diff --git 
a/src/data/array_interface.h b/src/data/array_interface.h index fee22203c..1b18f140a 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -26,6 +26,10 @@ #include "xgboost/logging.h" #include "xgboost/span.h" +#if defined(XGBOOST_USE_CUDA) +#include "cuda_fp16.h" +#endif + namespace xgboost { // Common errors in parsing columnar format. struct ArrayInterfaceErrors { @@ -304,12 +308,12 @@ class ArrayInterfaceHandler { template struct ToDType; // float -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) template <> struct ToDType<__half> { static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2; }; -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#endif // defined(XGBOOST_USE_CUDA) template <> struct ToDType { static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4; @@ -459,11 +463,11 @@ class ArrayInterface { CHECK(sizeof(long double) == 16) << error::NoF128(); type = T::kF16; } else if (typestr[1] == 'f' && typestr[2] == '2') { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) type = T::kF2; #else LOG(FATAL) << "Half type is not supported."; -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#endif // defined(XGBOOST_USE_CUDA) } else if (typestr[1] == 'f' && typestr[2] == '4') { type = T::kF4; } else if (typestr[1] == 'f' && typestr[2] == '8') { @@ -490,20 +494,17 @@ class ArrayInterface { } } - XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; } - XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; } + [[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; } + [[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; } template XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const { using T = ArrayInterfaceHandler::Type; switch (type) { case T::kF2: { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) return func(reinterpret_cast<__half const *>(data)); -#else - SPAN_CHECK(false); - return func(reinterpret_cast(data)); -#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#endif // defined(XGBOOST_USE_CUDA) } case T::kF4: return func(reinterpret_cast(data)); @@ -540,23 +541,23 @@ class ArrayInterface { return func(reinterpret_cast(data)); } - XGBOOST_DEVICE std::size_t ElementSize() const { + [[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const { return this->DispatchCall([](auto *typed_data_ptr) { return sizeof(std::remove_pointer_t); }); } - XGBOOST_DEVICE std::size_t ElementAlignment() const { + [[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const { return this->DispatchCall([](auto *typed_data_ptr) { return std::alignment_of>::value; }); } template - XGBOOST_DEVICE T operator()(Index &&...index) const { + XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const { static_assert(sizeof...(index) <= D, "Invalid index."); return this->DispatchCall([=](auto const *p_values) -> T { std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) // No operator defined for half -> size_t using Type = std::conditional_t< std::is_same<__half, @@ -566,7 +567,7 @@ class ArrayInterface { return static_cast(static_cast(p_values[offset])); #else return static_cast(p_values[offset]); -#endif +#endif // defined(XGBOOST_USE_CUDA) }); } @@ -603,7 +604,7 @@ void DispatchDType(ArrayInterface const array, 
std::int32_t device, Fn fn) { }; switch (array.type) { case ArrayInterfaceHandler::kF2: { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 +#if defined(XGBOOST_USE_CUDA) dispatch(__half{}); #endif break; From 17fd3f55e97d9d4718dbdfeb23dddc8b0f96fd02 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 30 May 2023 23:28:43 +0800 Subject: [PATCH 15/19] Optimize adapter element counting on GPU. (#9209) - Implement a simple `IterSpan` for passing iterators with size. - Use shared memory for column size counts. - Use one thread for each sample in row count to reduce atomic operations. --- include/xgboost/span.h | 51 +++++++-- src/common/hist_util.cu | 10 +- src/common/hist_util.cuh | 143 +++++++++++++++++++++----- src/data/device_adapter.cuh | 55 ++++++++-- src/data/iterative_dmatrix.cu | 4 +- tests/cpp/common/test_hist_util.cu | 67 ++++++++++++ tests/cpp/common/test_quantile.cu | 2 +- tests/cpp/common/test_span.cc | 33 ++++-- tests/cpp/data/test_device_adapter.cu | 19 ++++ 9 files changed, 323 insertions(+), 61 deletions(-) diff --git a/include/xgboost/span.h b/include/xgboost/span.h index 0b543b537..be8640f73 100644 --- a/include/xgboost/span.h +++ b/include/xgboost/span.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2018 XGBoost contributors +/** + * Copyright 2018-2023, XGBoost contributors * \brief span class based on ISO++20 span * * About NOLINTs in this file: @@ -32,11 +32,12 @@ #include #include -#include // size_t -#include // numeric_limits -#include -#include +#include // size_t #include +#include +#include // numeric_limits +#include +#include // for move #if defined(__CUDACC__) #include @@ -668,6 +669,44 @@ XGBOOST_DEVICE auto as_writable_bytes(Span s) __span_noexcept -> // NOLIN Span::value> { return {reinterpret_cast(s.data()), s.size_bytes()}; } + +/** + * \brief A simple custom Span type that uses general iterator instead of pointer. + */ +template +class IterSpan { + public: + using value_type = typename std::iterator_traits::value_type; // NOLINT + using index_type = std::size_t; // NOLINT + using iterator = It; // NOLINT + + private: + It it_; + index_type size_{0}; + + public: + IterSpan() = default; + XGBOOST_DEVICE IterSpan(It it, index_type size) : it_{std::move(it)}, size_{size} {} + XGBOOST_DEVICE explicit IterSpan(common::Span span) + : it_{span.data()}, size_{span.size()} {} + + [[nodiscard]] XGBOOST_DEVICE index_type size() const noexcept { return size_; } // NOLINT + [[nodiscard]] XGBOOST_DEVICE decltype(auto) operator[](index_type i) const { return it_[i]; } + [[nodiscard]] XGBOOST_DEVICE decltype(auto) operator[](index_type i) { return it_[i]; } + [[nodiscard]] XGBOOST_DEVICE bool empty() const noexcept { return size() == 0; } // NOLINT + [[nodiscard]] XGBOOST_DEVICE It data() const noexcept { return it_; } // NOLINT + [[nodiscard]] XGBOOST_DEVICE IterSpan subspan( // NOLINT + index_type _offset, index_type _count = dynamic_extent) const { + SPAN_CHECK((_count == dynamic_extent) ? (_offset <= size()) : (_offset + _count <= size())); + return {data() + _offset, _count == dynamic_extent ? 
size() - _offset : _count}; + } + [[nodiscard]] XGBOOST_DEVICE constexpr iterator begin() const noexcept { // NOLINT + return {this, 0}; + } + [[nodiscard]] XGBOOST_DEVICE constexpr iterator end() const noexcept { // NOLINT + return {this, size()}; + } +}; } // namespace common } // namespace xgboost diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 08ef98ea1..ae86129bf 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -204,9 +204,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page, return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size. }); detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature, - batch_it, dummy_is_valid, - 0, sorted_entries.size(), - &cuts_ptr, &column_sizes_scan); + IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, + &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { @@ -273,9 +272,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page, return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. }); detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature, - batch_it, dummy_is_valid, - 0, sorted_entries.size(), - &cuts_ptr, &column_sizes_scan); + IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, + &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 856404107..77424d7fa 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -53,32 +53,126 @@ struct EntryCompareOp { }; // Get column size from adapter batch and for output cuts. -template -void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feature, - Iter batch_iter, data::IsValidFunctor is_valid, - size_t begin, size_t end, - HostDeviceVector *cuts_ptr, +template +__global__ void GetColumnSizeSharedMemKernel(IterSpan batch_iter, + data::IsValidFunctor is_valid, + Span out_column_size) { + extern __shared__ char smem[]; + + auto smem_cs_ptr = reinterpret_cast(smem); + + dh::BlockFill(smem_cs_ptr, out_column_size.size(), 0); + + cub::CTA_SYNC(); + + auto n = batch_iter.size(); + + for (auto idx : dh::GridStrideRange(static_cast(0), n)) { + auto e = batch_iter[idx]; + if (is_valid(e)) { + atomicAdd(&smem_cs_ptr[e.column_idx], static_cast(1)); + } + } + + cub::CTA_SYNC(); + + auto out_global_ptr = out_column_size; + for (auto i : dh::BlockStrideRange(static_cast(0), out_column_size.size())) { + atomicAdd(&out_global_ptr[i], static_cast(smem_cs_ptr[i])); + } +} + +template +std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { + int n_mps = 0; + dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); + int n_blocks_per_mp = 0; + dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, + kBlockThreads, shared_mem)); + std::uint32_t grid_size = n_blocks_per_mp * n_mps; + return grid_size; +} + +/** + * \brief Get the size of each column. This is a histogram with additional handling of + * invalid values. + * + * \tparam BatchIt Type of input adapter batch. + * \tparam force_use_global_memory Used for testing. Force global atomic add. + * \tparam force_use_u64 Used for testing. For u64 as counter in shared memory. + * + * \param device CUDA device ordinal. 
+ * \param batch_iter Iterator for input data from adapter batch. + * \param is_valid Whehter an element is considered as missing. + * \param out_column_size Output buffer for the size of each column. + */ +template +void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter, + data::IsValidFunctor is_valid, Span out_column_size) { + thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); + + std::size_t max_shared_memory = dh::MaxSharedMemory(device); + // Not strictly correct as we should use number of samples to determine the type of + // counter. However, the sample size is not known due to sliding window on number of + // elements. + std::size_t n = batch_iter.size(); + + std::size_t required_shared_memory = 0; + bool use_u32{false}; + if (!force_use_u64 && n < static_cast(std::numeric_limits::max())) { + required_shared_memory = out_column_size.size() * sizeof(std::uint32_t); + use_u32 = true; + } else { + required_shared_memory = out_column_size.size() * sizeof(std::size_t); + use_u32 = false; + } + bool use_shared = required_shared_memory <= max_shared_memory && required_shared_memory != 0; + + if (!force_use_global_memory && use_shared) { + CHECK_NE(required_shared_memory, 0); + std::uint32_t constexpr kBlockThreads = 512; + if (use_u32) { + CHECK(!force_use_u64); + auto kernel = GetColumnSizeSharedMemKernel; + auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( + kernel, batch_iter, is_valid, out_column_size); + } else { + auto kernel = GetColumnSizeSharedMemKernel; + auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( + kernel, batch_iter, is_valid, out_column_size); + } + } else { + auto d_out_column_size = out_column_size; + dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) { + auto e = batch_iter[idx]; + if (is_valid(e)) { + atomicAdd(&d_out_column_size[e.column_idx], static_cast(1)); + } + }); + } +} + +template +void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature, + IterSpan batch_iter, data::IsValidFunctor is_valid, + HostDeviceVector* cuts_ptr, dh::caching_device_vector* column_sizes_scan) { - column_sizes_scan->resize(num_columns + 1, 0); + column_sizes_scan->resize(num_columns + 1); cuts_ptr->SetDevice(device); cuts_ptr->Resize(num_columns + 1, 0); dh::XGBCachingDeviceAllocator alloc; - auto d_column_sizes_scan = column_sizes_scan->data().get(); - dh::LaunchN(end - begin, [=] __device__(size_t idx) { - auto e = batch_iter[begin + idx]; - if (is_valid(e)) { - atomicAdd(&d_column_sizes_scan[e.column_idx], static_cast(1)); - } - }); + auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan); + LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan); // Calculate cuts CSC pointer auto cut_ptr_it = dh::MakeTransformIterator( column_sizes_scan->begin(), [=] __device__(size_t column_size) { return thrust::min(num_cuts_per_feature, column_size); }); thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it, - cut_ptr_it + column_sizes_scan->size(), - cuts_ptr->DevicePointer()); + cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer()); thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(), column_sizes_scan->end(), column_sizes_scan->begin()); } @@ -121,29 +215,26 @@ size_t RequiredMemory(bst_row_t num_rows, 
bst_feature_t num_columns, size_t nnz, // Count the valid entries in each column and copy them out. template -void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, - Range1d range, float missing, - size_t columns, size_t cuts_per_feature, int device, +void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, + float missing, size_t columns, size_t cuts_per_feature, int device, HostDeviceVector* cut_sizes_scan, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { auto entry_iter = dh::MakeTransformIterator( thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { - return Entry(batch.GetElement(idx).column_idx, - batch.GetElement(idx).value); + return Entry(batch.GetElement(idx).column_idx, batch.GetElement(idx).value); }); + auto n = range.end() - range.begin(); + auto span = IterSpan{batch_iter + range.begin(), n}; data::IsValidFunctor is_valid(missing); // Work out how many valid entries we have in each column - GetColumnSizesScan(device, columns, cuts_per_feature, - batch_iter, is_valid, - range.begin(), range.end(), - cut_sizes_scan, + GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan, column_sizes_scan); size_t num_valid = column_sizes_scan->back(); // Copy current subset of valid elements into temporary storage and sort sorted_entries->resize(num_valid); - dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), - sorted_entries->begin(), is_valid); + dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(), + is_valid); } void SortByWeight(dh::device_vector* weights, diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 136fbb743..8c11d74c9 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -39,6 +39,14 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return {row_idx, column_idx, value}; } + __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + auto const& column = columns_[fidx]; + float value = column.valid.Data() == nullptr || column.valid.Check(ridx) + ? column(ridx) + : std::numeric_limits::quiet_NaN(); + return value; + } + XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } @@ -160,6 +168,10 @@ class CupyAdapterBatch : public detail::NoMetaInfo { float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } + __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + float value = array_interface_(ridx, fidx); + return value; + } XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } @@ -196,24 +208,47 @@ class CupyAdapter : public detail::SingleBatchDataIter { // Returns maximum row length template -size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, - int device_idx, float missing) { +std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, int device_idx, + float missing) { dh::safe_cuda(cudaSetDevice(device_idx)); IsValidFunctor is_valid(missing); + dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); + + auto n_samples = batch.NumRows(); + bst_feature_t n_features = batch.NumCols(); + + // Use more than 1 threads for each row in case of dataset being too wide. 
+ bst_feature_t stride{0}; + if (n_features < 32) { + stride = std::min(n_features, 4u); + } else if (n_features < 64) { + stride = 8; + } else if (n_features < 128) { + stride = 16; + } else { + stride = 32; + } + // Count elements per row - dh::LaunchN(batch.Size(), [=] __device__(size_t idx) { - auto element = batch.GetElement(idx); - if (is_valid(element)) { - atomicAdd(reinterpret_cast( // NOLINT - &offset[element.row_idx]), - static_cast(1)); // NOLINT + dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) { + bst_row_t cnt{0}; + auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride); + SPAN_CHECK(ridx < n_samples); + for (bst_feature_t fidx = fbeg; fidx < n_features; fidx += stride) { + if (is_valid(batch.GetElement(ridx, fidx))) { + cnt++; + } } + + atomicAdd(reinterpret_cast( // NOLINT + &offset[ridx]), + static_cast(cnt)); // NOLINT }); dh::XGBCachingDeviceAllocator alloc; - size_t row_stride = + bst_row_t row_stride = dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data()) + offset.size(), - static_cast(0), thrust::maximum()); + static_cast(0), thrust::maximum()); return row_stride; } diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index f23bbd5a1..881b54297 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -80,7 +80,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, } auto batch_rows = num_rows(); accumulated_rows += batch_rows; - dh::caching_device_vector row_counts(batch_rows + 1, 0); + dh::device_vector row_counts(batch_rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) { return GetRowCounts(value, row_counts_span, get_device(), missing); @@ -134,7 +134,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, init_page(); dh::safe_cuda(cudaSetDevice(get_device())); auto rows = num_rows(); - dh::caching_device_vector row_counts(rows + 1, 0); + dh::device_vector row_counts(rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); Dispatch(proxy, [=](auto const& value) { return GetRowCounts(value, row_counts_span, get_device(), missing); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index e907a9f72..4d4e99ca9 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -483,6 +483,73 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { } } +namespace { +auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { + dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + auto n = n_samples * n_features; + std::vector x; + x.resize(n); + + std::iota(x.begin(), x.end(), 0); + std::int32_t c{0}; + float missing = n_samples * n_features; + for (std::size_t i = 0; i < x.size(); ++i) { + if (i % 5 == 0) { + x[i] = missing; + c++; + } + } + thrust::device_vector d_x; + d_x = x; + + auto n_invalids = n / 10 * 2 + 1; + auto is_valid = data::IsValidFunctor{missing}; + return std::tuple{x, d_x, n_invalids, is_valid}; +} + +void TestGetColumnSize(std::size_t n_samples) { + auto ctx = MakeCUDACtx(0); + bst_feature_t n_features = 12; + [[maybe_unused]] auto [x, d_x, n_invalids, is_valid] = MakeData(&ctx, n_samples, n_features); + + auto adapter = AdapterFromData(d_x, n_samples, n_features); + auto batch = adapter.Value(); + + auto batch_iter = dh::MakeTransformIterator( + 
thrust::make_counting_iterator(0llu), + [=] __device__(std::size_t idx) { return batch.GetElement(idx); }); + + dh::caching_device_vector column_sizes_scan; + column_sizes_scan.resize(n_features + 1); + std::vector h_column_size(column_sizes_scan.size()); + std::vector h_column_size_1(column_sizes_scan.size()); + + detail::LaunchGetColumnSizeKernel( + ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); + + detail::LaunchGetColumnSizeKernel( + ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); + ASSERT_EQ(h_column_size, h_column_size_1); + + detail::LaunchGetColumnSizeKernel( + ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); + ASSERT_EQ(h_column_size, h_column_size_1); + + detail::LaunchGetColumnSizeKernel( + ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); + ASSERT_EQ(h_column_size, h_column_size_1); +} +} // namespace + +TEST(HistUtil, GetColumnSize) { + bst_row_t n_samples = 4096; + TestGetColumnSize(n_samples); +} + // Check sketching from adapter or DMatrix results in the same answer // Consistency here is useful for testing and user experience TEST(HistUtil, SketchingEquivalent) { diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index f36334bcc..3a8e6d046 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -50,7 +50,7 @@ void TestSketchUnique(float sparsity) { thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); auto end = kCols * kRows; - detail::GetColumnSizesScan(0, kCols, n_cuts, batch_iter, is_valid, 0, end, + detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, &cut_sizes_scan, &column_sizes_scan); auto const& cut_sizes = cut_sizes_scan.HostVector(); ASSERT_LE(sketch.Data().size(), cut_sizes.back()); diff --git a/tests/cpp/common/test_span.cc b/tests/cpp/common/test_span.cc index 133fae9fd..b29c562bc 100644 --- a/tests/cpp/common/test_span.cc +++ b/tests/cpp/common/test_span.cc @@ -1,15 +1,16 @@ -/*! - * Copyright 2018 XGBoost contributors +/** + * Copyright 2018-2023, XGBoost contributors */ -#include -#include - -#include #include "test_span.h" -namespace xgboost { -namespace common { +#include +#include +#include + +#include "../../../src/common/transform_iterator.h" // for MakeIndexTransformIter + +namespace xgboost::common { TEST(Span, TestStatus) { int status = 1; TestTestStatus {&status}(); @@ -526,5 +527,17 @@ TEST(SpanDeathTest, Empty) { Span s{data.data(), static_cast::index_type>(0)}; EXPECT_DEATH(s[0], ""); // not ok to use it. 
} -} // namespace common -} // namespace xgboost + +TEST(IterSpan, Basic) { + auto iter = common::MakeIndexTransformIter([](std::size_t i) { return i; }); + std::size_t n = 13; + auto span = IterSpan{iter, n}; + ASSERT_EQ(span.size(), n); + for (std::size_t i = 0; i < n; ++i) { + ASSERT_EQ(span[i], i); + } + ASSERT_EQ(span.subspan(1).size(), n - 1); + ASSERT_EQ(span.subspan(1)[0], 1); + ASSERT_EQ(span.subspan(1, 2)[1], 2); +} +} // namespace xgboost::common diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu index f62b3dd80..db70c216c 100644 --- a/tests/cpp/data/test_device_adapter.cu +++ b/tests/cpp/data/test_device_adapter.cu @@ -51,3 +51,22 @@ void TestCudfAdapter() TEST(DeviceAdapter, CudfAdapter) { TestCudfAdapter(); } + +namespace xgboost::data { +TEST(DeviceAdapter, GetRowCounts) { + auto ctx = MakeCUDACtx(0); + + for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) { + HostDeviceVector storage; + auto str_arr = RandomDataGenerator{8192, n_features, 0.0} + .Device(ctx.gpu_id) + .GenerateArrayInterface(&storage); + auto adapter = CupyAdapter{str_arr}; + HostDeviceVector offset(adapter.NumRows() + 1, 0); + offset.SetDevice(ctx.gpu_id); + auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id, + std::numeric_limits::quiet_NaN()); + ASSERT_EQ(rstride, n_features); + } +} +} // namespace xgboost::data From 62e9387cd5cd47122f8ab6aee839076cd2863ed3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 31 May 2023 03:00:44 +0800 Subject: [PATCH 16/19] [ci] Update PySpark version. (#9214) --- tests/ci_build/Dockerfile.gpu | 2 +- tests/ci_build/conda_env/aarch64_test.yml | 2 +- tests/ci_build/conda_env/linux_cpu_test.yml | 4 +--- tests/ci_build/conda_env/macos_cpu_test.yml | 2 +- tests/ci_build/conda_env/python_lint.yml | 4 +--- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 4011ad834..0822767c5 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -25,7 +25,7 @@ RUN \ python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \ dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ - pyspark cloudpickle cuda-python && \ + pyspark>=3.4.0 cloudpickle cuda-python && \ mamba clean --all && \ conda run --no-capture-output -n gpu_test pip install buildkite-test-collector diff --git a/tests/ci_build/conda_env/aarch64_test.yml b/tests/ci_build/conda_env/aarch64_test.yml index 42a2fe1e4..2af0324c9 100644 --- a/tests/ci_build/conda_env/aarch64_test.yml +++ b/tests/ci_build/conda_env/aarch64_test.yml @@ -28,7 +28,7 @@ dependencies: - llvmlite - cffi - pyarrow -- pyspark +- pyspark>=3.4.0 - cloudpickle - pip: - awscli diff --git a/tests/ci_build/conda_env/linux_cpu_test.yml b/tests/ci_build/conda_env/linux_cpu_test.yml index bf657708d..d87d8fdef 100644 --- a/tests/ci_build/conda_env/linux_cpu_test.yml +++ b/tests/ci_build/conda_env/linux_cpu_test.yml @@ -38,8 +38,6 @@ dependencies: - protobuf - cloudpickle - modin -# TODO: Replace it with pyspark>=3.4 once 3.4 released. 
-# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz -- pyspark>=3.3.1 +- pyspark>=3.4.0 - pip: - datatable diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/tests/ci_build/conda_env/macos_cpu_test.yml index 11d82ff7b..dfc1ee600 100644 --- a/tests/ci_build/conda_env/macos_cpu_test.yml +++ b/tests/ci_build/conda_env/macos_cpu_test.yml @@ -35,7 +35,7 @@ dependencies: - py-ubjson - cffi - pyarrow -- pyspark +- pyspark>=3.4.0 - cloudpickle - pip: - sphinx_rtd_theme diff --git a/tests/ci_build/conda_env/python_lint.yml b/tests/ci_build/conda_env/python_lint.yml index 3d42dfaf3..dc5105a84 100644 --- a/tests/ci_build/conda_env/python_lint.yml +++ b/tests/ci_build/conda_env/python_lint.yml @@ -19,6 +19,4 @@ dependencies: - pytest - hypothesis - hatchling -- pip: - # TODO: Replace it with pyspark>=3.4 once 3.4 released. - - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz +- pyspark>=3.4.0 From 7f20eaed93d1844f76a68898143a496e5d11b070 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 31 May 2023 05:00:02 +0800 Subject: [PATCH 17/19] [doc] Troubleshoot nccl shared memory. [skip ci] (#9206) --- doc/tutorials/dask.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 888683975..3562015e2 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -519,6 +519,9 @@ Troubleshooting the ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug logs. +- If NCCL fails to initialize in a container environment, it might be caused by limited + system shared memory. With docker, one can try the flag: `--shm-size=4g`. + - MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message that includes `Multiple processes within a communication group ...` upon initialization. From aba4559c4f88841b7a2ded1f76a2691fb4010f08 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 31 May 2023 05:01:02 +0800 Subject: [PATCH 18/19] [doc] Update dask demo. (#9201) --- demo/dask/gpu_training.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py index 23cbfb47c..1a75f6c70 100644 --- a/demo/dask/gpu_training.py +++ b/demo/dask/gpu_training.py @@ -38,19 +38,18 @@ def using_dask_matrix(client: Client, X, y): def using_quantile_device_dmatrix(client: Client, X, y): - """`DaskQuantileDMatrix` is a data type specialized for `gpu_hist`, tree - method that reduces memory overhead. When training on GPU pipeline, it's - preferred over `DaskDMatrix`. + """`DaskQuantileDMatrix` is a data type specialized for `gpu_hist` and `hist` tree + methods for reducing memory usage. .. versionadded:: 1.2.0 """ - # Input must be on GPU for `DaskQuantileDMatrix`. X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X)) y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y)) - # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful - # that it can not be used for anything else other than training. + # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`, be careful that it can not + # be used for anything else other than training unless a reference is specified. See + # the `ref` argument of `DaskQuantileDMatrix`. 
dtrain = dxgb.DaskQuantileDMatrix(client, X, y) output = xgb.dask.train( client, {"verbosity": 2, "tree_method": "gpu_hist"}, dtrain, num_boost_round=4 From fa2ab1f0218e86fc31ba3216e212d1eb43003c84 Mon Sep 17 00:00:00 2001 From: ZHAOKAI WANG Date: Wed, 31 May 2023 20:27:27 +0800 Subject: [PATCH 19/19] TreeRefresher note word spelling modification (#9223) --- src/tree/updater_refresh.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index 448492de0..2bfd3c8de 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -20,7 +20,7 @@ namespace xgboost::tree { DMLC_REGISTRY_FILE_TAG(updater_refresh); -/*! \brief pruner that prunes a tree after growing finishs */ +/*! \brief pruner that prunes a tree after growing finishes */ class TreeRefresher : public TreeUpdater { public: explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}
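
The `ref` argument mentioned in the updated demo comment of #9201 above is what lets a `DaskQuantileDMatrix` be used beyond training. A minimal sketch, assuming an existing dask `client` with GPU workers and dask_cudf inputs named `X_train`/`y_train`/`X_valid`/`y_valid` (all hypothetical names, not from the patch):

    from xgboost import dask as dxgb

    dtrain = dxgb.DaskQuantileDMatrix(client, X_train, y_train)
    # The validation matrix reuses the quantile cuts of dtrain via `ref`,
    # so it can be passed to `evals` without recomputing the sketch.
    dvalid = dxgb.DaskQuantileDMatrix(client, X_valid, y_valid, ref=dtrain)

    output = dxgb.train(
        client,
        {"tree_method": "gpu_hist"},
        dtrain,
        num_boost_round=4,
        evals=[(dvalid, "valid")],
    )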