sync Jun 1

2023-06-01 15:55:06 -07:00
parent c5b575e00e fa2ab1f021
commit 42867a4805
76 changed files with 1424 additions and 595 deletions
--- a/src/collective/communicator-inl.h
+++ b/src/collective/communicator-inl.h
@@ -3,6 +3,7 @@
 */
 #pragma once
 #include <string>
+#include <vector>

 #include "communicator.h"

@@ -224,5 +225,46 @@ inline void Allreduce(double *send_receive_buffer, size_t count) {
  Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
 }

+template <typename T>
+struct AllgatherVResult {
+  std::vector<std::size_t> offsets;
+  std::vector<std::size_t> sizes;
+  std::vector<T> result;
+};
+
+/**
+ * @brief Gathers variable-length data from all processes and distributes it to all processes.
+ *
+ * We assume each worker has the same number of inputs, but each input may be of a different size.
+ *
+ * @param inputs All the inputs from the local worker.
+ * @param sizes  Sizes of each input.
+ */
+template <typename T>
+inline AllgatherVResult<T> AllgatherV(std::vector<T> const &inputs,
+                                      std::vector<std::size_t> const &sizes) {
+  auto num_inputs = sizes.size();
+
+  // Gather the sizes across all workers.
+  std::vector<std::size_t> all_sizes(num_inputs * GetWorldSize());
+  std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank());
+  collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t));
+
+  // Calculate input offsets (std::exclusive_scan).
+  std::vector<std::size_t> offsets(all_sizes.size());
+  for (std::size_t i = 1; i < offsets.size(); i++) {
+    offsets[i] = offsets[i - 1] + all_sizes[i - 1];
+  }
+
+  // Gather all the inputs.
+  auto total_input_size = offsets.back() + all_sizes.back();
+  std::vector<T> all_inputs(total_input_size);
+  std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]);
+  // We cannot use allgather here, since each worker might have a different size.
+  Allreduce<Operation::kMax>(all_inputs.data(), all_inputs.size());
+
+  return {offsets, all_sizes, all_inputs};
+}
+
 }  // namespace collective
 }  // namespace xgboost
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@@ -12,19 +12,22 @@
 namespace xgboost {
 namespace collective {

-thread_local int Communicator::device_ordinal_{-1};
 thread_local std::unique_ptr<DeviceCommunicator> Communicator::device_communicator_{};

 void Communicator::Finalize() {
  communicator_->Shutdown();
  communicator_.reset(new NoOpCommunicator());
-  device_ordinal_ = -1;
  device_communicator_.reset(nullptr);
 }

 DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
-  if (!device_communicator_ || device_ordinal_ != device_ordinal) {
-    device_ordinal_ = device_ordinal;
+  thread_local auto old_device_ordinal = -1;
+  // If the number of GPUs changes, we need to re-initialize NCCL.
+  thread_local auto old_world_size = -1;
+  if (!device_communicator_ || device_ordinal != old_device_ordinal ||
+      communicator_->GetWorldSize() != old_world_size) {
+    old_device_ordinal = device_ordinal;
+    old_world_size = communicator_->GetWorldSize();
 #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
    if (type_ != CommunicatorType::kFederated) {
      device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@@ -229,7 +229,6 @@ class Communicator {
  static thread_local std::unique_ptr<Communicator> communicator_;
  static thread_local CommunicatorType type_;
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-  static thread_local int device_ordinal_;
  static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
 #endif

--- a/src/common/cuda_context.cuh
+++ b/src/common/cuda_context.cuh
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2023, XGBoost Contributors
 */
 #ifndef XGBOOST_COMMON_CUDA_CONTEXT_CUH_
 #define XGBOOST_COMMON_CUDA_CONTEXT_CUH_
@@ -16,21 +16,39 @@ struct CUDAContext {
  /**
   * \brief Caching thrust policy.
   */
-#if defined(XGBOOST_USE_HIP)
-  auto CTP() const { return thrust::hip::par(caching_alloc_).on(dh::DefaultStream()); }
+  auto CTP() const {
+#if defined(XGBOOST_USE_CUDA)
+#if THRUST_MAJOR_VERSION >= 2
+    return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
-  auto CTP() const { return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); }
+    return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
+#endif  // THRUST_MAJOR_VERSION >= 2
+#elif defined(XGBOOST_USE_HIP)
+#if THRUST_MAJOR_VERSION >= 2
+    return thrust::hip::par_nosync(caching_alloc_).on(dh::DefaultStream());
+#else
+    return thrust::hip::par(caching_alloc_).on(dh::DefaultStream());
+#endif  // THRUST_MAJOR_VERSION >= 2
 #endif
-
+  }
  /**
   * \brief Thrust policy without caching allocator.
   */
-#if defined(XGBOOST_USE_HIP)
-  auto TP() const { return thrust::hip::par(alloc_).on(dh::DefaultStream()); }
+  auto TP() const {
+#if defined(XGBOOST_USE_CUDA)
+#if THRUST_MAJOR_VERSION >= 2
+    return thrust::cuda::par_nosync(alloc_).on(dh::DefaultStream());
 #else
-  auto TP() const { return thrust::cuda::par(alloc_).on(dh::DefaultStream()); }
+    return thrust::cuda::par(alloc_).on(dh::DefaultStream());
+#endif  // THRUST_MAJOR_VERSION >= 2
+#elif defined(XGBOOST_USE_HIP)
+#if THRUST_MAJOR_VERSION >= 2
+    return thrust::hip::par_nosync(alloc_).on(dh::DefaultStream());
+#else
+    return thrust::hip::par(alloc_).on(dh::DefaultStream());
+#endif  // THRUST_MAJOR_VERSION >= 2
 #endif
-
+  }
  auto Stream() const { return dh::DefaultStream(); }
 };
 }  // namespace xgboost
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -227,9 +227,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
        return {0, e.index, e.fvalue};  // row_idx is not needed for scanning column size.
      });
  detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
-                             batch_it, dummy_is_valid,
-                             0, sorted_entries.size(),
-                             &cuts_ptr, &column_sizes_scan);
+                             IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
+                             &column_sizes_scan);
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();

  if (sketch_container->HasCategorical()) {
@@ -296,9 +295,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
        return {0, e.index, e.fvalue};  // row_idx is not needed for scaning column size.
      });
  detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature,
-                             batch_it, dummy_is_valid,
-                             0, sorted_entries.size(),
-                             &cuts_ptr, &column_sizes_scan);
+                             IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
+                             &column_sizes_scan);
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
  if (sketch_container->HasCategorical()) {
    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -17,6 +17,10 @@
 #include "quantile.cuh"
 #include "timer.h"

+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif
+
 namespace xgboost {
 namespace common {
 namespace cuda {
@@ -53,24 +57,128 @@ struct EntryCompareOp {
 };

 // Get column size from adapter batch and for output cuts.
-template <typename Iter>
-void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feature,
-                        Iter batch_iter, data::IsValidFunctor is_valid,
-                        size_t begin, size_t end,
-                        HostDeviceVector<SketchContainer::OffsetT> *cuts_ptr,
+template <std::uint32_t kBlockThreads, typename CounterT, typename BatchIt>
+__global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
+                                             data::IsValidFunctor is_valid,
+                                             Span<std::size_t> out_column_size) {
+  extern __shared__ char smem[];
+
+  auto smem_cs_ptr = reinterpret_cast<CounterT*>(smem);
+
+  dh::BlockFill(smem_cs_ptr, out_column_size.size(), 0);
+
+  cub::CTA_SYNC();
+
+  auto n = batch_iter.size();
+
+  for (auto idx : dh::GridStrideRange(static_cast<std::size_t>(0), n)) {
+    auto e = batch_iter[idx];
+    if (is_valid(e)) {
+      atomicAdd(&smem_cs_ptr[e.column_idx], static_cast<CounterT>(1));
+    }
+  }
+
+  cub::CTA_SYNC();
+
+  auto out_global_ptr = out_column_size;
+  for (auto i : dh::BlockStrideRange(static_cast<std::size_t>(0), out_column_size.size())) {
+    atomicAdd(&out_global_ptr[i], static_cast<std::size_t>(smem_cs_ptr[i]));
+  }
+}
+
+template <std::uint32_t kBlockThreads, typename Kernel>
+std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
+  int n_mps = 0;
+#if defined(XGBOOST_USE_CUDA)
+  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
+#endif
+  int n_blocks_per_mp = 0;
+#if defined(XGBOOST_USE_CUDA)
+  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
+                                                              kBlockThreads, shared_mem));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
+                                                              kBlockThreads, shared_mem));
+#endif
+  std::uint32_t grid_size = n_blocks_per_mp * n_mps;
+  return grid_size;
+}
+
+/**
+ * \brief Get the size of each column. This is a histogram with additional handling of
+ *        invalid values.
+ *
+ * \tparam BatchIt                 Type of input adapter batch.
+ * \tparam force_use_global_memory Used for testing. Force global atomic add.
+ * \tparam force_use_u64           Used for testing. Force u64 as counter in shared memory.
+ *
+ * \param device     CUDA device ordinal.
+ * \param batch_iter Iterator for input data from adapter batch.
+ * \param is_valid   Whether an element is considered as missing.
+ * \param out_column_size Output buffer for the size of each column.
+ */
+template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
+void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
+                               data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
+  thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
+
+  std::size_t max_shared_memory = dh::MaxSharedMemory(device);
+  // Not strictly correct as we should use number of samples to determine the type of
+  // counter. However, the sample size is not known due to sliding window on number of
+  // elements.
+  std::size_t n = batch_iter.size();
+
+  std::size_t required_shared_memory = 0;
+  bool use_u32{false};
+  if (!force_use_u64 && n < static_cast<std::size_t>(std::numeric_limits<std::uint32_t>::max())) {
+    required_shared_memory = out_column_size.size() * sizeof(std::uint32_t);
+    use_u32 = true;
+  } else {
+    required_shared_memory = out_column_size.size() * sizeof(std::size_t);
+    use_u32 = false;
+  }
+  bool use_shared = required_shared_memory <= max_shared_memory && required_shared_memory != 0;
+
+  if (!force_use_global_memory && use_shared) {
+    CHECK_NE(required_shared_memory, 0);
+    std::uint32_t constexpr kBlockThreads = 512;
+    if (use_u32) {
+      CHECK(!force_use_u64);
+      auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
+      auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
+      dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
+          kernel, batch_iter, is_valid, out_column_size);
+    } else {
+      auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
+      auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
+      dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}(
+          kernel, batch_iter, is_valid, out_column_size);
+    }
+  } else {
+    auto d_out_column_size = out_column_size;
+    dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) {
+      auto e = batch_iter[idx];
+      if (is_valid(e)) {
+        atomicAdd(&d_out_column_size[e.column_idx], static_cast<size_t>(1));
+      }
+    });
+  }
+}
+
+template <typename BatchIt>
+void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
+                        IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
+                        HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
                        dh::caching_device_vector<size_t>* column_sizes_scan) {
-  column_sizes_scan->resize(num_columns + 1, 0);
+  column_sizes_scan->resize(num_columns + 1);
  cuts_ptr->SetDevice(device);
  cuts_ptr->Resize(num_columns + 1, 0);

  dh::XGBCachingDeviceAllocator<char> alloc;
-  auto d_column_sizes_scan = column_sizes_scan->data().get();
-  dh::LaunchN(end - begin, [=] __device__(size_t idx) {
-    auto e = batch_iter[begin + idx];
-    if (is_valid(e)) {
-      atomicAdd(&d_column_sizes_scan[e.column_idx], static_cast<size_t>(1));
-    }
-  });
+  auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan);
+  LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan);
  // Calculate cuts CSC pointer
  auto cut_ptr_it = dh::MakeTransformIterator<size_t>(
      column_sizes_scan->begin(), [=] __device__(size_t column_size) {
@@ -85,8 +193,7 @@ void GetColumnSizesScan(int device, size_t num_columns, size_t num_cuts_per_feat
                         column_sizes_scan->end(), column_sizes_scan->begin());
 #elif defined(XGBOOST_USE_CUDA)
  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
-                         cut_ptr_it + column_sizes_scan->size(),
-                         cuts_ptr->DevicePointer());
+                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
                         column_sizes_scan->end(), column_sizes_scan->begin());
 #endif
@@ -130,29 +237,26 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,

 // Count the valid entries in each column and copy them out.
 template <typename AdapterBatch, typename BatchIter>
-void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
-                            Range1d range, float missing,
-                            size_t columns, size_t cuts_per_feature, int device,
+void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
+                            float missing, size_t columns, size_t cuts_per_feature, int device,
                            HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
                            dh::caching_device_vector<size_t>* column_sizes_scan,
                            dh::device_vector<Entry>* sorted_entries) {
  auto entry_iter = dh::MakeTransformIterator<Entry>(
      thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) {
-        return Entry(batch.GetElement(idx).column_idx,
-                     batch.GetElement(idx).value);
+        return Entry(batch.GetElement(idx).column_idx, batch.GetElement(idx).value);
      });
+  auto n = range.end() - range.begin();
+  auto span = IterSpan{batch_iter + range.begin(), n};
  data::IsValidFunctor is_valid(missing);
  // Work out how many valid entries we have in each column
-  GetColumnSizesScan(device, columns, cuts_per_feature,
-                     batch_iter, is_valid,
-                     range.begin(), range.end(),
-                     cut_sizes_scan,
+  GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
                     column_sizes_scan);
  size_t num_valid = column_sizes_scan->back();
  // Copy current subset of valid elements into temporary storage and sort
  sorted_entries->resize(num_valid);
-  dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(),
-             sorted_entries->begin(), is_valid);
+  dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
+             is_valid);
 }

 void SortByWeight(dh::device_vector<float>* weights,
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -209,7 +209,7 @@ class PartitionBuilder {
                BitVector* decision_bits, BitVector* missing_bits) {
    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
    std::size_t nid = nodes[node_in_set].nid;
-    bst_feature_t fid = tree[nid].SplitIndex();
+    bst_feature_t fid = tree.SplitIndex(nid);
    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
    auto node_cats = tree.NodeCats(nid);
    auto const& cut_values = gmat.cut.Values();
@@ -263,14 +263,13 @@ class PartitionBuilder {
  template <typename ExpandEntry>
  void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
                       const common::Range1d range, GHistIndexMatrix const& gmat,
-                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
-                       const size_t* rid, BitVector const& decision_bits,
+                       const RegTree& tree, const size_t* rid, BitVector const& decision_bits,
                       BitVector const& missing_bits) {
    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
    std::size_t nid = nodes[node_in_set].nid;
-    bool default_left = tree[nid].DefaultLeft();
+    bool default_left = tree.DefaultLeft(nid);

    auto pred = [&](auto ridx) {
      bool go_left = default_left;
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -7,7 +7,6 @@
 #include <utility>

 #include "../collective/aggregator.h"
-#include "../collective/communicator-inl.h"
 #include "../data/adapter.h"
 #include "categorical.h"
 #include "hist_util.h"
@@ -143,6 +142,7 @@ struct QuantileAllreduce {

 template <typename WQSketch>
 void SketchContainerImpl<WQSketch>::GatherSketchInfo(
+    MetaInfo const& info,
    std::vector<typename WQSketch::SummaryContainer> const &reduced,
    std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
    std::vector<typename WQSketch::Entry> *p_global_sketches) {
@@ -168,7 +168,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
  std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);

  // Gather all column pointers
-  collective::Allreduce<collective::Operation::kSum>(sketches_scan.data(), sketches_scan.size());
+  collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size());
  for (int32_t i = 0; i < world; ++i) {
    size_t back = (i + 1) * (n_columns + 1) - 1;
    auto n_entries = sketches_scan.at(back);
@@ -196,7 +196,8 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(

  static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
                "Unexpected size of sketch entry.");
-  collective::Allreduce<collective::Operation::kSum>(
+  collective::GlobalSum(
+      info,
      reinterpret_cast<float *>(global_sketches.data()),
      global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
 }
@@ -222,8 +223,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
  std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
  size_t feat_begin = rank * feature_ptr.size();  // pointer to current worker
  std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
-  collective::Allreduce<collective::Operation::kSum>(global_feat_ptrs.data(),
-                                                     global_feat_ptrs.size());
+  collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size());

  // move all categories into a flatten vector to prepare for allreduce
  size_t total = feature_ptr.back();
@@ -236,8 +236,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
  // indptr for indexing workers
  std::vector<size_t> global_worker_ptr(world_size + 1, 0);
  global_worker_ptr[rank + 1] = total;  // shift 1 to right for constructing the indptr
-  collective::Allreduce<collective::Operation::kSum>(global_worker_ptr.data(),
-                                                     global_worker_ptr.size());
+  collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size());
  std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
  // total number of categories in all workers with all features
  auto gtotal = global_worker_ptr.back();
@@ -249,8 +248,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
  CHECK_EQ(rank_size, total);
  std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
  // gather values from all workers.
-  collective::Allreduce<collective::Operation::kSum>(global_categories.data(),
-                                                     global_categories.size());
+  collective::GlobalSum(info, global_categories.data(), global_categories.size());
  QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
                                            categories_.size()};
  ParallelFor(categories_.size(), n_threads_, [&](auto fidx) {
@@ -323,7 +321,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
  std::vector<bst_row_t> sketches_scan((n_columns + 1) * world, 0);

  std::vector<typename WQSketch::Entry> global_sketches;
-  this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, &global_sketches);
+  this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches);

  std::vector<typename WQSketch::SummaryContainer> final_sketches(n_columns);

@@ -371,7 +369,9 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
    InvalidCategory();
  }
  auto &cut_values = cuts->cut_values_.HostVector();
-  auto max_cat = *std::max_element(categories.cbegin(), categories.cend());
+  // With column-wise data split, the categories may be empty.
+  auto max_cat =
+      categories.empty() ? 0.0f : *std::max_element(categories.cbegin(), categories.cend());
  CheckMaxCat(max_cat, categories.size());
  for (bst_cat_t i = 0; i <= AsCat(max_cat); ++i) {
    cut_values.push_back(i);
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -822,7 +822,8 @@ class SketchContainerImpl {
    return group_ind;
  }
  // Gather sketches from all workers.
-  void GatherSketchInfo(std::vector<typename WQSketch::SummaryContainer> const &reduced,
+  void GatherSketchInfo(MetaInfo const& info,
+                        std::vector<typename WQSketch::SummaryContainer> const &reduced,
                        std::vector<bst_row_t> *p_worker_segments,
                        std::vector<bst_row_t> *p_sketches_scan,
                        std::vector<typename WQSketch::Entry> *p_global_sketches);
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -26,6 +26,12 @@
 #include "xgboost/logging.h"
 #include "xgboost/span.h"

+#if defined(XGBOOST_USE_CUDA)
+#include "cuda_fp16.h"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_fp16.h>
+#endif
+
 namespace xgboost {
 // Common errors in parsing columnar format.
 struct ArrayInterfaceErrors {
@@ -304,12 +310,12 @@ class ArrayInterfaceHandler {
 template <typename T, typename E = void>
 struct ToDType;
 // float
-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
 template <>
 struct ToDType<__half> {
  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
 };
-#endif  // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
 template <>
 struct ToDType<float> {
  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@@ -459,11 +465,11 @@ class ArrayInterface {
      CHECK(sizeof(long double) == 16) << error::NoF128();
      type = T::kF16;
    } else if (typestr[1] == 'f' && typestr[2] == '2') {
-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
      type = T::kF2;
 #else
      LOG(FATAL) << "Half type is not supported.";
-#endif  // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
    } else if (typestr[1] == 'f' && typestr[2] == '4') {
      type = T::kF4;
    } else if (typestr[1] == 'f' && typestr[2] == '8') {
@@ -490,20 +496,17 @@ class ArrayInterface {
    }
  }

-  XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; }
-  XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; }
+  [[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; }
+  [[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; }

  template <typename Fn>
  XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const {
    using T = ArrayInterfaceHandler::Type;
    switch (type) {
      case T::kF2: {
-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
        return func(reinterpret_cast<__half const *>(data));
-#else
-        SPAN_CHECK(false);
-        return func(reinterpret_cast<float const *>(data));
-#endif  // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
      }
      case T::kF4:
        return func(reinterpret_cast<float const *>(data));
@@ -540,23 +543,23 @@ class ArrayInterface {
    return func(reinterpret_cast<uint64_t const *>(data));
  }

-  XGBOOST_DEVICE std::size_t ElementSize() const {
+  [[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const {
    return this->DispatchCall([](auto *typed_data_ptr) {
      return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
    });
  }
-  XGBOOST_DEVICE std::size_t ElementAlignment() const {
+  [[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const {
    return this->DispatchCall([](auto *typed_data_ptr) {
      return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
    });
  }

  template <typename T = float, typename... Index>
-  XGBOOST_DEVICE T operator()(Index &&...index) const {
+  XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const {
    static_assert(sizeof...(index) <= D, "Invalid index.");
    return this->DispatchCall([=](auto const *p_values) -> T {
      std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
      // No operator defined for half -> size_t
      using Type = std::conditional_t<
          std::is_same<__half,
@@ -566,7 +569,7 @@ class ArrayInterface {
      return static_cast<T>(static_cast<Type>(p_values[offset]));
 #else
      return static_cast<T>(p_values[offset]);
-#endif
+#endif  // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
    });
  }

@@ -603,7 +606,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
  };
  switch (array.type) {
    case ArrayInterfaceHandler::kF2: {
-#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
+#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
      dispatch(__half{});
 #endif
      break;
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -698,6 +698,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
    this->feature_type_names = that.feature_type_names;
    auto &h_feature_types = feature_types.HostVector();
    LoadFeatureType(this->feature_type_names, &h_feature_types);
+  } else if (!that.feature_types.Empty()) {
+    this->feature_types.Resize(that.feature_types.Size());
+    this->feature_types.Copy(that.feature_types);
  }
  if (!that.feature_weights.Empty()) {
    this->feature_weights.Resize(that.feature_weights.Size());
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -29,7 +29,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
      : columns_(columns),
        num_rows_(num_rows) {}
  size_t Size() const { return num_rows_ * columns_.size(); }
-  __device__ COOTuple GetElement(size_t idx) const {
+  __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
    size_t column_idx = idx % columns_.size();
    size_t row_idx = idx / columns_.size();
    auto const& column = columns_[column_idx];
@@ -39,6 +39,14 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
    return {row_idx, column_idx, value};
  }

+  __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
+    auto const& column = columns_[fidx];
+    float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
+                      ? column(ridx)
+                      : std::numeric_limits<float>::quiet_NaN();
+    return value;
+  }
+
  XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
  XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }

@@ -166,6 +174,10 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
    float value = array_interface_(row_idx, column_idx);
    return {row_idx, column_idx, value};
  }
+  __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
+    float value = array_interface_(ridx, fidx);
+    return value;
+  }

  XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
  XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
@@ -202,40 +214,64 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

 // Returns maximum row length
 template <typename AdapterBatchT>
-size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
-                    int device_idx, float missing) {
-
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#elif defined(XGBOOST_USE_CUDA)
+std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
+                         float missing) {
+#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_idx));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(device_idx));
 #endif

  IsValidFunctor is_valid(missing);
+#if defined(XGBOOST_USE_CUDA)
+  dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
+#endif
+
+  auto n_samples = batch.NumRows();
+  bst_feature_t n_features = batch.NumCols();
+
+  // Use more than one thread per row in case the dataset is too wide.
+  bst_feature_t stride{0};
+  if (n_features < 32) {
+    stride = std::min(n_features, 4u);
+  } else if (n_features < 64) {
+    stride = 8;
+  } else if (n_features < 128) {
+    stride = 16;
+  } else {
+    stride = 32;
+  }
+
  // Count elements per row
-  dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
-    auto element = batch.GetElement(idx);
-    if (is_valid(element)) {
-      atomicAdd(reinterpret_cast<unsigned long long*>(  // NOLINT
-                    &offset[element.row_idx]),
-                static_cast<unsigned long long>(1));  // NOLINT
+  dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
+    bst_row_t cnt{0};
+    auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
+    SPAN_CHECK(ridx < n_samples);
+    for (bst_feature_t fidx = fbeg; fidx < n_features; fidx += stride) {
+      if (is_valid(batch.GetElement(ridx, fidx))) {
+        cnt++;
+      }
    }
+
+    atomicAdd(reinterpret_cast<unsigned long long*>(  // NOLINT
+                  &offset[ridx]),
+              static_cast<unsigned long long>(cnt));  // NOLINT
  });

  dh::XGBCachingDeviceAllocator<char> alloc;
-
-#if defined(XGBOOST_USE_HIP)
-  size_t row_stride =
-      dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
-                 thrust::device_pointer_cast(offset.data()) + offset.size(),
-                 static_cast<std::size_t>(0), thrust::maximum<size_t>());
-#elif defined(XGBOOST_USE_CUDA)
-  size_t row_stride =
+#if defined(XGBOOST_USE_CUDA)
+  bst_row_t row_stride =
      dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
                 thrust::device_pointer_cast(offset.data()) + offset.size(),
-                 static_cast<std::size_t>(0), thrust::maximum<size_t>());
+                 static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
+#elif defined(XGBOOST_USE_HIP)
+  bst_row_t row_stride =
+      dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
+                 thrust::device_pointer_cast(offset.data()) + offset.size(),
+                 static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
 #endif
-
  return row_stride;
 }

@@ -243,13 +279,29 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
 * \brief Check there's no inf in data.
 */
 template <typename AdapterBatchT>
-bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
+bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
  auto counting = thrust::make_counting_iterator(0llu);
-  auto value_iter = dh::MakeTransformIterator<float>(
-      counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
-  auto valid =
-      thrust::none_of(value_iter, value_iter + batch.Size(),
-                      [is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
+  auto value_iter = dh::MakeTransformIterator<bool>(counting, [=] XGBOOST_DEVICE(std::size_t idx) {
+    auto v = batch.GetElement(idx).value;
+    if (!is_valid(v)) {
+      // Invalid elements are ignored, so they count as passing the check.
+      return true;
+    }
+    // check that there's no inf in data.
+    return !std::isinf(v);
+  });
+  dh::XGBCachingDeviceAllocator<char> alloc;
+  // The default implementation in thrust optimizes any_of/none_of/all_of by using small
+  // intervals to stop early. But we expect all data to be valid here, so using small
+  // intervals only decreases performance due to excessive kernel launches and stream
+  // synchronization.
+#if defined(XGBOOST_USE_CUDA)
+  auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true,
+                          thrust::logical_and<>{});
+#elif defined(XGBOOST_USE_HIP)
+  auto valid = dh::Reduce(thrust::hip::par(alloc), value_iter, value_iter + batch.Size(), true,
+                          thrust::logical_and<>{});
+#endif
  return valid;
 }
 };  // namespace data
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -213,7 +213,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
  // correct output position
  auto counting = thrust::make_counting_iterator(0llu);
  data::IsValidFunctor is_valid(missing);
-  bool valid = data::HasInfInData(batch, is_valid);
+  bool valid = data::NoInfInData(batch, is_valid);
  CHECK(valid) << error::InfInData();

  auto key_iter = dh::MakeTransformIterator<size_t>(
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -92,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    }
    auto batch_rows = num_rows();
    accumulated_rows += batch_rows;
-    dh::caching_device_vector<size_t> row_counts(batch_rows + 1, 0);
+    dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
    row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
                            return GetRowCounts(value, row_counts_span, get_device(), missing);
@@ -163,7 +163,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
 #endif

    auto rows = num_rows();
-    dh::caching_device_vector<size_t> row_counts(rows + 1, 0);
+    dh::device_vector<size_t> row_counts(rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
    Dispatch(proxy, [=](auto const& value) {
      return GetRowCounts(value, row_counts_span, get_device(), missing);
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -92,7 +92,7 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
 template <typename AdapterBatchT>
 size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
                        SparsePage* page) {
-  bool valid = HasInfInData(batch, IsValidFunctor{missing});
+  bool valid = NoInfInData(batch, IsValidFunctor{missing});
  CHECK(valid) << error::InfInData();

  page->offset.SetDevice(device);
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -67,7 +67,7 @@ class ColumnSplitHelper {
      const int32_t nid = nodes[node_in_set].nid;
      const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
      partition_builder_->AllocateForTask(task_id);
-      partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
+      partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree,
                                          (*row_set_collection_)[nid].begin, decision_bits_,
                                          missing_bits_);
    });
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -25,7 +25,6 @@
 #include "xgboost/linalg.h"            // for Constants, Vector

 namespace xgboost::tree {
-template <typename ExpandEntry>
 class HistEvaluator {
 private:
  struct NodeEntry {
@@ -285,10 +284,42 @@ class HistEvaluator {
    return left_sum;
  }

+  /**
+   * @brief Gather the expand entries from all the workers.
+   * @param entries Local expand entries on this worker.
+   * @return Global expand entries gathered from all workers.
+   */
+  std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
+    auto const world = collective::GetWorldSize();
+    auto const rank = collective::GetRank();
+    auto const num_entries = entries.size();
+
+    // First, gather all the primitive fields.
+    std::vector<CPUExpandEntry> all_entries(num_entries * world);
+    std::vector<uint32_t> cat_bits;
+    std::vector<std::size_t> cat_bits_sizes;
+    for (std::size_t i = 0; i < num_entries; i++) {
+      all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
+    }
+    collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry));
+
+    // Gather all the cat_bits.
+    auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes);
+
+    common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
+      // Copy the cat_bits back into all expand entries.
+      all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
+      std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
+                  all_entries[i].split.cat_bits.begin());
+    });
+
+    return all_entries;
+  }
+
 public:
  void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
                      common::Span<FeatureType const> feature_types, const RegTree &tree,
-                      std::vector<ExpandEntry> *p_entries) {
+                      std::vector<CPUExpandEntry> *p_entries) {
    auto n_threads = ctx_->Threads();
    auto& entries = *p_entries;
    // All nodes are on the same level, so we can store the shared ptr.
@@ -306,7 +337,7 @@ class HistEvaluator {
      return features[nidx_in_set]->Size();
    }, grain_size);

-    std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
+    std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
    for (size_t i = 0; i < entries.size(); ++i) {
      for (decltype(n_threads) j = 0; j < n_threads; ++j) {
        tloc_candidates[i * n_threads + j] = entries[i];
@@ -365,22 +396,18 @@ class HistEvaluator {
    if (is_col_split_) {
      // With column-wise data split, we gather the best splits from all the workers and update the
      // expand entries accordingly.
-      auto const world = collective::GetWorldSize();
-      auto const rank = collective::GetRank();
-      auto const num_entries = entries.size();
-      std::vector<ExpandEntry> buffer{num_entries * world};
-      std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
-      collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
-      for (auto worker = 0; worker < world; ++worker) {
+      auto all_entries = Allgather(entries);
+      for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
        for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
-          entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
+          entries[nidx_in_set].split.Update(
+              all_entries[worker * entries.size() + nidx_in_set].split);
        }
      }
    }
  }

  // Add splits to tree, handles all statistic
-  void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
+  void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
    auto evaluator = tree_evaluator_.GetEvaluator();
    RegTree &tree = *p_tree;

@@ -465,6 +492,7 @@ class HistMultiEvaluator {
  FeatureInteractionConstraintHost interaction_constraints_;
  std::shared_ptr<common::ColumnSampler> column_sampler_;
  Context const *ctx_;
+  bool is_col_split_{false};

 private:
  static double MultiCalcSplitGain(TrainParam const &param,
@@ -543,6 +571,57 @@ class HistMultiEvaluator {
    return false;
  }

+  /**
+   * @brief Gather the expand entries from all the workers.
+   * @param entries Local expand entries on this worker.
+   * @return Global expand entries gathered from all workers.
+   */
+  std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
+    auto const world = collective::GetWorldSize();
+    auto const rank = collective::GetRank();
+    auto const num_entries = entries.size();
+
+    // First, gather all the primitive fields.
+    std::vector<MultiExpandEntry> all_entries(num_entries * world);
+    std::vector<uint32_t> cat_bits;
+    std::vector<std::size_t> cat_bits_sizes;
+    std::vector<GradientPairPrecise> gradients;
+    for (std::size_t i = 0; i < num_entries; i++) {
+      all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes,
+                                                         &gradients);
+    }
+    collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry));
+
+    // Gather all the cat_bits.
+    auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes);
+
+    // Gather all the gradients.
+    auto const num_gradients = gradients.size();
+    std::vector<GradientPairPrecise> all_gradients(num_gradients * world);
+    std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank);
+    collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise));
+
+    auto const total_entries = num_entries * world;
+    auto const gradients_per_entry = num_gradients / num_entries;
+    auto const gradients_per_side = gradients_per_entry / 2;
+    common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
+      // Copy the cat_bits back into all expand entries.
+      all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
+      std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
+                  gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());
+
+      // Copy the gradients back into all expand entries.
+      all_entries[i].split.left_sum.resize(gradients_per_side);
+      std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
+                  all_entries[i].split.left_sum.begin());
+      all_entries[i].split.right_sum.resize(gradients_per_side);
+      std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
+                  gradients_per_side, all_entries[i].split.right_sum.begin());
+    });
+
+    return all_entries;
+  }
+
 public:
  void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
                      common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@@ -597,6 +676,18 @@ class HistMultiEvaluator {
        entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
      }
    }
+
+    if (is_col_split_) {
+      // With column-wise data split, we gather the best splits from all the workers and update the
+      // expand entries accordingly.
+      auto all_entries = Allgather(entries);
+      for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
+        for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
+          entries[nidx_in_set].split.Update(
+              all_entries[worker * entries.size() + nidx_in_set].split);
+        }
+      }
+    }
  }

  linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
@@ -660,7 +751,10 @@ class HistMultiEvaluator {

  explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
                              std::shared_ptr<common::ColumnSampler> sampler)
-      : param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
+      : param_{param},
+        column_sampler_{std::move(sampler)},
+        ctx_{ctx},
+        is_col_split_{info.IsColumnSplit()} {
    interaction_constraints_.Configure(*param, info.num_col_);
    column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
                          param_->colsample_bynode, param_->colsample_bylevel,
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -70,6 +70,22 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
    os << "split:\n" << e.split << std::endl;
    return os;
  }
+
+  /**
+   * @brief Copy primitive fields into this, and collect cat_bits into a vector.
+   *
+   * This is used for allgather.
+   *
+   * @param that The other entry to copy from
+   * @param collected_cat_bits The vector to collect cat_bits
+   * @param cat_bits_sizes The sizes of the collected cat_bits
+   */
+  void CopyAndCollect(CPUExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
+                      std::vector<std::size_t>* cat_bits_sizes) {
+    nid = that.nid;
+    depth = that.depth;
+    split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes);
+  }
 };

 struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
@@ -119,6 +135,24 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
    os << "]\n";
    return os;
  }
+
+  /**
+   * @brief Copy primitive fields into this, and collect cat_bits and gradients into vectors.
+   *
+   * This is used for allgather.
+   *
+   * @param that The other entry to copy from
+   * @param collected_cat_bits The vector to collect cat_bits
+   * @param cat_bits_sizes The sizes of the collected cat_bits
+   * @param collected_gradients The vector to collect gradients
+   */
+  void CopyAndCollect(MultiExpandEntry const& that, std::vector<uint32_t>* collected_cat_bits,
+                      std::vector<std::size_t>* cat_bits_sizes,
+                      std::vector<GradientPairPrecise>* collected_gradients) {
+    nid = that.nid;
+    depth = that.depth;
+    split.CopyAndCollect(that.split, collected_cat_bits, cat_bits_sizes, collected_gradients);
+  }
 };
 }  // namespace xgboost::tree
 #endif  // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -419,6 +419,60 @@ struct SplitEntryContainer {
       << "right_sum: " << s.right_sum << std::endl;
    return os;
  }
+
+  /**
+   * @brief Copy primitive fields into this, and collect cat_bits into a vector.
+   *
+   * This is used for allgather.
+   *
+   * @param that The other entry to copy from
+   * @param collected_cat_bits The vector to collect cat_bits
+   * @param cat_bits_sizes The sizes of the collected cat_bits
+   */
+  void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
+                      std::vector<uint32_t> *collected_cat_bits,
+                      std::vector<std::size_t> *cat_bits_sizes) {
+    loss_chg = that.loss_chg;
+    sindex = that.sindex;
+    split_value = that.split_value;
+    is_cat = that.is_cat;
+    static_assert(std::is_trivially_copyable_v<GradientT>);
+    left_sum = that.left_sum;
+    right_sum = that.right_sum;
+    collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(),
+                               that.cat_bits.cend());
+    cat_bits_sizes->emplace_back(that.cat_bits.size());
+  }
+
+  /**
+   * @brief Copy primitive fields into this, and collect cat_bits and gradient sums into vectors.
+   *
+   * This is used for allgather.
+   *
+   * @param that The other entry to copy from
+   * @param collected_cat_bits The vector to collect cat_bits
+   * @param cat_bits_sizes The sizes of the collected cat_bits
+   * @param collected_gradients The vector to collect gradients
+   */
+  template <typename G>
+  void CopyAndCollect(SplitEntryContainer<GradientT> const &that,
+                      std::vector<uint32_t> *collected_cat_bits,
+                      std::vector<std::size_t> *cat_bits_sizes,
+                      std::vector<G> *collected_gradients) {
+    loss_chg = that.loss_chg;
+    sindex = that.sindex;
+    split_value = that.split_value;
+    is_cat = that.is_cat;
+    collected_cat_bits->insert(collected_cat_bits->end(), that.cat_bits.cbegin(),
+                               that.cat_bits.cend());
+    cat_bits_sizes->emplace_back(that.cat_bits.size());
+    static_assert(!std::is_trivially_copyable_v<GradientT>);
+    collected_gradients->insert(collected_gradients->end(), that.left_sum.cbegin(),
+                                that.left_sum.cend());
+    collected_gradients->insert(collected_gradients->end(), that.right_sum.cbegin(),
+                                that.right_sum.cend());
+  }
+
  /*!\return feature index to split on */
  [[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
  /*!\return whether missing value goes to left branch */
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -44,7 +44,7 @@ class GloablApproxBuilder {
 protected:
  TrainParam const *param_;
  std::shared_ptr<common::ColumnSampler> col_sampler_;
-  HistEvaluator<CPUExpandEntry> evaluator_;
+  HistEvaluator evaluator_;
  HistogramBuilder<CPUExpandEntry> histogram_builder_;
  Context const *ctx_;
  ObjInfo const *const task_;
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -13,6 +13,7 @@
 #include <utility>                           // for move, swap
 #include <vector>                            // for vector

+#include "../collective/aggregator.h"        // for GlobalSum
 #include "../collective/communicator-inl.h"  // for Allreduce, IsDistributed
 #include "../collective/communicator.h"      // for Operation
 #include "../common/hist_util.h"             // for HistogramCuts, HistCollection
@@ -200,8 +201,8 @@ class MultiTargetHistBuilder {
      }
    }
    CHECK(root_sum.CContiguous());
-    collective::Allreduce<collective::Operation::kSum>(
-        reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
+    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
+                          root_sum.Size() * 2);

    std::vector<MultiExpandEntry> nodes{best};
    std::size_t i = 0;
@@ -335,7 +336,7 @@ class HistBuilder {
  common::Monitor *monitor_;
  TrainParam const *param_;
  std::shared_ptr<common::ColumnSampler> col_sampler_;
-  std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
+  std::unique_ptr<HistEvaluator> evaluator_;
  std::vector<CommonRowPartitioner> partitioner_;

  // back pointers to tree and data matrix
@@ -354,7 +355,7 @@ class HistBuilder {
      : monitor_{monitor},
        param_{param},
        col_sampler_{std::move(column_sampler)},
-        evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
+        evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
                                                                   col_sampler_)},
        p_last_fmat_(fmat),
        histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
@@ -395,8 +396,7 @@ class HistBuilder {
    }
    histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
                              collective::IsDistributed(), fmat->Info().IsColumnSplit());
-    evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
-                                                                 col_sampler_);
+    evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
    p_last_tree_ = p_tree;
    monitor_->Stop(__func__);
  }
@@ -455,8 +455,7 @@ class HistBuilder {
        for (auto const &grad : gpair_h) {
          grad_stat.Add(grad.GetGrad(), grad.GetHess());
        }
-        collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
-                                                           2);
+        collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
      }

      auto weight = evaluator_->InitRoot(GradStats{grad_stat});
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -20,7 +20,7 @@ namespace xgboost::tree {

 DMLC_REGISTRY_FILE_TAG(updater_refresh);

-/*! \brief pruner that prunes a tree after growing finishs */
+/*! \brief pruner that prunes a tree after growing finishes */
 class TreeRefresher : public TreeUpdater {
 public:
  explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}