merge latest changes

This commit is contained in:
Hui Liu
2024-03-12 09:13:09 -07:00
174 changed files with 5276 additions and 2304 deletions

View File

@@ -1,22 +1,21 @@
/**
* Copyright 2023 by XGBoost contributors
* Copyright 2023-2024, XGBoost contributors
*
* Higher level functions built on top of the Communicator API, taking care of behavioral differences
* between row-split vs column-split distributed training, and horizontal vs vertical federated
* learning.
*/
#pragma once
#include <xgboost/data.h>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "communicator-inl.h"
#include "xgboost/collective/result.h" // for Result
#include "xgboost/data.h" // for MetaInfo
namespace xgboost {
namespace collective {
namespace xgboost::collective {
/**
* @brief Apply the given function where the labels are.
@@ -31,15 +30,16 @@ namespace collective {
* @param size The size of the buffer.
* @param function The function used to calculate the results.
*/
template <typename Function>
void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) {
template <typename FN>
void ApplyWithLabels(Context const*, MetaInfo const& info, void* buffer, std::size_t size,
FN&& function) {
if (info.IsVerticalFederated()) {
// We assume labels are only available on worker 0, so the calculation is done there and result
// broadcast to other workers.
std::string message;
if (collective::GetRank() == 0) {
try {
std::forward<Function>(function)();
std::forward<FN>(function)();
} catch (dmlc::Error& e) {
message = e.what();
}
@@ -52,7 +52,7 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
LOG(FATAL) << &message[0];
}
} else {
std::forward<Function>(function)();
std::forward<FN>(function)();
}
}
@@ -70,7 +70,8 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
* @param function The function used to calculate the results.
*/
template <typename T, typename Function>
void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function&& function) {
void ApplyWithLabels(Context const*, MetaInfo const& info, HostDeviceVector<T>* result,
Function&& function) {
if (info.IsVerticalFederated()) {
// We assume labels are only available on worker 0, so the calculation is done there and result
// broadcast to other workers.
@@ -114,7 +115,9 @@ void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function
* @return The global max of the input.
*/
template <typename T>
T GlobalMax(MetaInfo const& info, T value) {
std::enable_if_t<std::is_trivially_copy_assignable_v<T>, T> GlobalMax(Context const*,
MetaInfo const& info,
T value) {
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kMax>(&value, 1);
}
@@ -132,16 +135,18 @@ T GlobalMax(MetaInfo const& info, T value) {
* @param values Pointer to the inputs to sum.
* @param size Number of values to sum.
*/
template <typename T>
void GlobalSum(MetaInfo const& info, T* values, size_t size) {
template <typename T, std::int32_t kDim>
[[nodiscard]] Result GlobalSum(Context const*, MetaInfo const& info,
linalg::TensorView<T, kDim> values) {
if (info.IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(values, size);
collective::Allreduce<collective::Operation::kSum>(values.Values().data(), values.Size());
}
return Success();
}
template <typename Container>
void GlobalSum(MetaInfo const& info, Container* values) {
GlobalSum(info, values->data(), values->size());
[[nodiscard]] Result GlobalSum(Context const* ctx, MetaInfo const& info, Container* values) {
return GlobalSum(ctx, info, values->data(), values->size());
}
/**
@@ -157,9 +162,10 @@ void GlobalSum(MetaInfo const& info, Container* values) {
* @return The global ratio of the two inputs.
*/
template <typename T>
T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
T GlobalRatio(Context const* ctx, MetaInfo const& info, T dividend, T divisor) {
std::array<T, 2> results{dividend, divisor};
GlobalSum(info, &results);
auto rc = GlobalSum(ctx, info, linalg::MakeVec(results.data(), results.size()));
collective::SafeColl(rc);
std::tie(dividend, divisor) = std::tuple_cat(results);
if (divisor <= 0) {
return std::numeric_limits<T>::quiet_NaN();
@@ -167,6 +173,4 @@ T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
return dividend / divisor;
}
}
} // namespace collective
} // namespace xgboost
} // namespace xgboost::collective

View File

@@ -0,0 +1,34 @@
/**
* Copyright 2024, XGBoost contributors
*/
#include "communicator-inl.h"
namespace xgboost::collective {
[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
    std::vector<std::vector<char>> const &input) {
  // Gather the length of every local vector from all workers; the gathered
  // list is the concatenation of each worker's size list, in rank order.
  std::vector<std::int64_t> local_sizes(input.size());
  std::transform(input.cbegin(), input.cend(), local_sizes.begin(),
                 [](auto const &vec) { return vec.size(); });
  auto all_sizes = AllgatherV(local_sizes);

  // Flatten the local vectors into one byte buffer, then gather the bytes
  // from all workers into a single global buffer.
  std::vector<char> flat;
  for (auto const &vec : input) {
    flat.insert(flat.end(), vec.cbegin(), vec.cend());
  }
  auto gathered = AllgatherV(flat);

  // Slice the global buffer back into per-input vectors using a running
  // offset driven by the gathered sizes.
  std::vector<std::vector<char>> result;
  result.reserve(all_sizes.size());
  std::int64_t begin = 0;
  for (auto size : all_sizes) {
    result.emplace_back(gathered.cbegin() + begin, gathered.cbegin() + begin + size);
    begin += size;
  }
  return result;
}
} // namespace xgboost::collective

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022-2023 by XGBoost contributors
* Copyright 2022-2024, XGBoost contributors
*/
#pragma once
#include <string>
@@ -192,6 +192,18 @@ inline std::vector<T> AllgatherV(std::vector<T> const &input) {
return result;
}
/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
*
* @param input All the inputs from the local worker. The number of inputs can vary
* across different workers. Along with which, the size of each vector in
* the input can also vary.
*
* @return The AllgatherV result, containing vectors from all workers.
*/
[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
std::vector<std::vector<char>> const &input);
/**
* @brief Gathers variable-length strings from all processes and distributes them to all processes.
* @param input Variable-length list of variable-length strings.
@@ -294,38 +306,5 @@ template <Operation op>
inline void Allreduce(double *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
}
template <typename T>
struct SpecialAllgatherVResult {
std::vector<std::size_t> offsets;
std::vector<std::size_t> sizes;
std::vector<T> result;
};
/**
* @brief Gathers variable-length data from all processes and distributes it to all processes.
*
* We assume each worker has the same number of inputs, but each input may be of a different size.
*
* @param inputs All the inputs from the local worker.
* @param sizes Sizes of each input.
*/
template <typename T>
inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
std::vector<std::size_t> const &sizes) {
// Gather the sizes across all workers.
auto const all_sizes = Allgather(sizes);
// Calculate input offsets (std::exclusive_scan).
std::vector<std::size_t> offsets(all_sizes.size());
for (std::size_t i = 1; i < offsets.size(); i++) {
offsets[i] = offsets[i - 1] + all_sizes[i - 1];
}
// Gather all the inputs.
auto const all_inputs = AllgatherV(inputs);
return {offsets, all_sizes, all_inputs};
}
} // namespace collective
} // namespace xgboost

View File

@@ -1,11 +1,12 @@
/**
* Copyright 2019-2023, XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
*/
#include "xgboost/json.h"
#include <array> // for array
#include <cctype> // for isdigit
#include <cmath> // for isinf, isnan
#include <cstdint> // for uint8_t, uint16_t, uint32_t
#include <cstdio> // for EOF
#include <cstdlib> // for size_t, strtof
#include <cstring> // for memcpy
@@ -72,15 +73,16 @@ void JsonWriter::Visit(JsonNumber const* num) {
}
void JsonWriter::Visit(JsonInteger const* num) {
char i2s_buffer_[NumericLimits<int64_t>::kToCharsSize];
std::array<char, NumericLimits<int64_t>::kToCharsSize> i2s_buffer_;
auto i = num->GetInteger();
auto ret = to_chars(i2s_buffer_, i2s_buffer_ + NumericLimits<int64_t>::kToCharsSize, i);
auto ret =
to_chars(i2s_buffer_.data(), i2s_buffer_.data() + NumericLimits<int64_t>::kToCharsSize, i);
auto end = ret.ptr;
CHECK(ret.ec == std::errc());
auto digits = std::distance(i2s_buffer_, end);
auto digits = std::distance(i2s_buffer_.data(), end);
auto ori_size = stream_->size();
stream_->resize(ori_size + digits);
std::memcpy(stream_->data() + ori_size, i2s_buffer_, digits);
std::memcpy(stream_->data() + ori_size, i2s_buffer_.data(), digits);
}
void JsonWriter::Visit(JsonNull const* ) {
@@ -143,8 +145,10 @@ std::string Value::TypeStr() const {
return "Null";
case ValueKind::kInteger:
return "Integer";
case ValueKind::kNumberArray:
case ValueKind::kF32Array:
return "F32Array";
case ValueKind::kF64Array:
return "F64Array";
case ValueKind::kU8Array:
return "U8Array";
case ValueKind::kI32Array:
@@ -262,10 +266,11 @@ bool JsonTypedArray<T, kind>::operator==(Value const& rhs) const {
return std::equal(arr.cbegin(), arr.cend(), vec_.cbegin());
}
template class JsonTypedArray<float, Value::ValueKind::kNumberArray>;
template class JsonTypedArray<uint8_t, Value::ValueKind::kU8Array>;
template class JsonTypedArray<int32_t, Value::ValueKind::kI32Array>;
template class JsonTypedArray<int64_t, Value::ValueKind::kI64Array>;
template class JsonTypedArray<float, Value::ValueKind::kF32Array>;
template class JsonTypedArray<double, Value::ValueKind::kF64Array>;
template class JsonTypedArray<std::uint8_t, Value::ValueKind::kU8Array>;
template class JsonTypedArray<std::int32_t, Value::ValueKind::kI32Array>;
template class JsonTypedArray<std::int64_t, Value::ValueKind::kI64Array>;
// Json Number
bool JsonNumber::operator==(Value const& rhs) const {
@@ -708,6 +713,8 @@ Json UBJReader::ParseArray() {
switch (type) {
case 'd':
return ParseTypedArray<F32Array>(n);
case 'D':
return ParseTypedArray<F64Array>(n);
case 'U':
return ParseTypedArray<U8Array>(n);
case 'l':
@@ -791,12 +798,16 @@ Json UBJReader::Parse() {
return Json{JsonBoolean{true}};
}
case 'F': {
return Json{JsonBoolean{true}};
return Json{JsonBoolean{false}};
}
case 'd': {
auto v = this->ReadPrimitive<float>();
return Json{v};
}
case 'D': {
auto v = this->ReadPrimitive<double>();
return Json{v};
}
case 'S': {
auto str = this->DecodeStr();
return Json{str};
@@ -825,10 +836,6 @@ Json UBJReader::Parse() {
Integer::Int i = this->ReadPrimitive<char>();
return Json{i};
}
case 'D': {
LOG(FATAL) << "f64 is not supported.";
break;
}
case 'H': {
LOG(FATAL) << "High precision number is not supported.";
break;
@@ -882,6 +889,8 @@ void WriteTypedArray(JsonTypedArray<T, kind> const* arr, std::vector<char>* stre
stream->push_back('$');
if (std::is_same<T, float>::value) {
stream->push_back('d');
} else if (std::is_same_v<T, double>) {
stream->push_back('D');
} else if (std::is_same<T, int8_t>::value) {
stream->push_back('i');
} else if (std::is_same<T, uint8_t>::value) {
@@ -910,6 +919,7 @@ void WriteTypedArray(JsonTypedArray<T, kind> const* arr, std::vector<char>* stre
}
void UBJWriter::Visit(F32Array const* arr) { WriteTypedArray(arr, stream_); }
void UBJWriter::Visit(F64Array const* arr) { WriteTypedArray(arr, stream_); }
void UBJWriter::Visit(U8Array const* arr) { WriteTypedArray(arr, stream_); }
void UBJWriter::Visit(I32Array const* arr) { WriteTypedArray(arr, stream_); }
void UBJWriter::Visit(I64Array const* arr) { WriteTypedArray(arr, stream_); }

View File

@@ -13,15 +13,14 @@
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for TensorView
namespace xgboost {
namespace linalg {
namespace xgboost::linalg {
namespace cuda_impl {
// Use template specialization to dispatch, Windows + CUDA 11.8 doesn't support extended
// lambda inside constexpr if
template <typename T, std::int32_t D>
struct ElementWiseImpl {
template <typename Fn>
void operator()(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
void operator()(TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
static_assert(D > 1);
dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) mutable {
std::apply(fn, linalg::UnravelIndex(i, t.Shape()));
@@ -32,37 +31,59 @@ struct ElementWiseImpl {
template <typename T>
struct ElementWiseImpl<T, 1> {
template <typename Fn>
void operator()(linalg::TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
void operator()(TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) { fn(i); });
}
};
template <typename T, std::int32_t D, typename Fn>
void ElementWiseKernel(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
void ElementWiseKernel(TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
cuda_impl::ElementWiseImpl<T, D>{}(t, fn, s);
}
} // namespace cuda_impl
template <typename T, int32_t D, typename Fn>
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
{
void ElementWiseTransformDevice(TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
if (t.Contiguous()) {
auto ptr = t.Values().data();
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });
} else {
dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable {
T& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
T& v = detail::Apply(t, UnravelIndex(i, t.Shape()));
v = fn(i, v);
});
}
}
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
ctx->IsCUDA() ? cuda_impl::ElementWiseKernel(t, fn)
: ElementWiseKernelHost(t, ctx->Threads(), fn);
}
} // namespace linalg
} // namespace xgboost
namespace detail {
template <typename T, std::int32_t kDim>
struct IterOp {
TensorView<T, kDim> v;
XGBOOST_DEVICE T& operator()(std::size_t i) {
return detail::Apply(v, UnravelIndex(i, v.Shape()));
}
};
} // namespace detail
// naming: thrust begin
// returns a thrust iterator for a tensor view.
template <typename T, std::int32_t kDim>
auto tcbegin(TensorView<T, kDim> v) { // NOLINT
return dh::MakeTransformIterator<T>(
thrust::make_counting_iterator(0ul),
detail::IterOp<std::add_const_t<std::remove_const_t<T>>, kDim>{v});
}
template <typename T, std::int32_t kDim>
auto tcend(TensorView<T, kDim> v) { // NOLINT
return tcbegin(v) + v.Size();
}
} // namespace xgboost::linalg
#endif // XGBOOST_COMMON_LINALG_OP_CUH_

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2024, XGBoost Contributors
*/
#include "quantile.h"
@@ -145,7 +145,7 @@ struct QuantileAllreduce {
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::GatherSketchInfo(
Context const *, MetaInfo const &info,
Context const *ctx, MetaInfo const &info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches) {
@@ -171,7 +171,9 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);
// Gather all column pointers
collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size());
auto rc =
collective::GlobalSum(ctx, info, linalg::MakeVec(sketches_scan.data(), sketches_scan.size()));
collective::SafeColl(rc);
for (int32_t i = 0; i < world; ++i) {
size_t back = (i + 1) * (n_columns + 1) - 1;
auto n_entries = sketches_scan.at(back);
@@ -199,14 +201,15 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
"Unexpected size of sketch entry.");
collective::GlobalSum(
info,
reinterpret_cast<float *>(global_sketches.data()),
global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
rc = collective::GlobalSum(
ctx, info,
linalg::MakeVec(reinterpret_cast<float *>(global_sketches.data()),
global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float)));
collective::SafeColl(rc);
}
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo const& info) {
void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const* ctx, MetaInfo const& info) {
auto world_size = collective::GetWorldSize();
auto rank = collective::GetRank();
if (world_size == 1 || info.IsColumnSplit()) {
@@ -226,7 +229,8 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker
std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size());
auto rc = collective::GlobalSum(
ctx, info, linalg::MakeVec(global_feat_ptrs.data(), global_feat_ptrs.size()));
// move all categories into a flatten vector to prepare for allreduce
size_t total = feature_ptr.back();
@@ -239,7 +243,8 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
// indptr for indexing workers
std::vector<size_t> global_worker_ptr(world_size + 1, 0);
global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr
collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size());
rc = collective::GlobalSum(ctx, info,
linalg::MakeVec(global_worker_ptr.data(), global_worker_ptr.size()));
std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
// total number of categories in all workers with all features
auto gtotal = global_worker_ptr.back();
@@ -251,7 +256,8 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
CHECK_EQ(rank_size, total);
std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
// gather values from all workers.
collective::GlobalSum(info, global_categories.data(), global_categories.size());
rc = collective::GlobalSum(ctx, info,
linalg::MakeVec(global_categories.data(), global_categories.size()));
QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
categories_.size()};
ParallelFor(categories_.size(), n_threads_, [&](auto fidx) {
@@ -293,7 +299,9 @@ void SketchContainerImpl<WQSketch>::AllReduce(
// Prune the intermediate num cuts for synchronization.
std::vector<bst_row_t> global_column_size(columns_size_);
collective::GlobalSum(info, &global_column_size);
auto rc = collective::GlobalSum(
ctx, info, linalg::MakeVec(global_column_size.data(), global_column_size.size()));
collective::SafeColl(rc);
ParallelFor(sketches_.size(), n_threads_, [&](size_t i) {
int32_t intermediate_num_cuts = static_cast<int32_t>(

View File

@@ -31,7 +31,7 @@ namespace xgboost::common {
*/
using RandomEngine = std::mt19937;
#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
#if defined(XGBOOST_CUSTOMIZE_GLOBAL_PRNG) && XGBOOST_CUSTOMIZE_GLOBAL_PRNG == 1
/*!
* \brief An customized random engine, used to be plugged in PRNG from other systems.
* The implementation of this library is not provided by xgboost core library.

View File

@@ -78,6 +78,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
// unbiased
bool lambdarank_unbiased{false};
bool lambdarank_normalization{true};
double lambdarank_bias_norm{1.0};
// ndcg
bool ndcg_exp_gain{true};
@@ -86,6 +87,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
return lambdarank_pair_method == that.lambdarank_pair_method &&
lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample &&
lambdarank_unbiased == that.lambdarank_unbiased &&
lambdarank_normalization == that.lambdarank_normalization &&
lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain;
}
bool operator!=(LambdaRankParam const& that) const { return !(*this == that); }
@@ -134,6 +136,9 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
DMLC_DECLARE_FIELD(lambdarank_unbiased)
.set_default(false)
.describe("Unbiased lambda mart. Use extended IPW to debias click position");
DMLC_DECLARE_FIELD(lambdarank_normalization)
.set_default(true)
.describe("Whether to normalize the leaf value for lambda rank.");
DMLC_DECLARE_FIELD(lambdarank_bias_norm)
.set_default(1.0)
.set_lower_bound(0.0)

View File

@@ -106,30 +106,13 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
Validate(*this);
}
namespace {
std::int32_t IOThreads(Context const* ctx) {
CHECK(ctx);
std::int32_t n_threads = ctx->Threads();
// CRAN checks for number of threads used by examples, but we might not have the right
// number of threads when serializing/unserializing models as nthread is a booster
// parameter, which is only effective after booster initialization.
//
// The threshold ratio of CPU time to user time for R is 2.5, we set the number of
// threads to 2.
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
n_threads = std::min(2, n_threads);
#endif
return n_threads;
}
} // namespace
void GBTreeModel::SaveModel(Json* p_out) const {
auto& out = *p_out;
CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
out["gbtree_model_param"] = ToJson(param);
std::vector<Json> trees_json(trees.size());
common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) {
common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
auto const& tree = trees[t];
Json jtree{Object{}};
tree->SaveModel(&jtree);
@@ -167,7 +150,7 @@ void GBTreeModel::LoadModel(Json const& in) {
CHECK_EQ(tree_info_json.size(), param.num_trees);
tree_info.resize(param.num_trees);
common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) {
common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
auto tree_id = get<Integer const>(trees_json[t]["id"]);
trees.at(tree_id).reset(new RegTree{});
trees[tree_id]->LoadModel(trees_json[t]);

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2014-2023 by XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
* \file learner.cc
* \brief Implementation of learning algorithm.
* \author Tianqi Chen
@@ -846,7 +846,7 @@ class LearnerConfiguration : public Learner {
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
base_score->Reshape(1);
collective::ApplyWithLabels(info, base_score->Data(),
collective::ApplyWithLabels(this->Ctx(), info, base_score->Data(),
[&] { UsePtr(obj_)->InitEstimation(info, base_score); });
}
};
@@ -1472,7 +1472,7 @@ class LearnerImpl : public LearnerIO {
void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
collective::ApplyWithLabels(info, out_gpair->Data(),
collective::ApplyWithLabels(&ctx_, info, out_gpair->Data(),
[&] { obj_->GetGradient(preds, info, iter, out_gpair); });
}

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#include "auc.h"
@@ -112,7 +112,9 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
// we have 2 averages going in here, first is among workers, second is among
// classes. allreduce sums up fp/tp auc for each class.
collective::GlobalSum(info, &results.Values());
auto rc = collective::GlobalSum(ctx, info, results);
collective::SafeColl(rc);
double auc_sum{0};
double tp_sum{0};
for (size_t c = 0; c < n_classes; ++c) {
@@ -286,7 +288,7 @@ class EvalAUC : public MetricNoCache {
InvalidGroupAUC();
}
auc = collective::GlobalRatio(info, auc, static_cast<double>(valid_groups));
auc = collective::GlobalRatio(ctx_, info, auc, static_cast<double>(valid_groups));
if (!std::isnan(auc)) {
CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
<< ", valid groups: " << valid_groups;
@@ -307,7 +309,7 @@ class EvalAUC : public MetricNoCache {
std::tie(fp, tp, auc) =
static_cast<Curve *>(this)->EvalBinary(preds, info);
}
auc = collective::GlobalRatio(info, auc, fp * tp);
auc = collective::GlobalRatio(ctx_, info, auc, fp * tp);
if (!std::isnan(auc)) {
CHECK_LE(auc, 1.0);
}

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2015-2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file elementwise_metric.cu
* \brief evaluation metrics for elementwise binary or regression.
* \author Kailong Chen, Tianqi Chen
@@ -12,13 +12,14 @@
#include <cmath>
#include "../collective/communicator-inl.h"
#include "../common/common.h" // MetricNoCache
#include "../common/common.h" // MetricNoCache
#include "../common/math.h"
#include "../common/optional_weight.h" // OptionalWeights
#include "../common/pseudo_huber.h"
#include "../common/quantile_loss_utils.h" // QuantileLossParam
#include "../common/threading_utils.h"
#include "metric_common.h"
#include "xgboost/collective/result.h" // for SafeColl
#include "xgboost/metric.h"
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -30,8 +31,7 @@
#include "../common/device_helpers.cuh"
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(elementwise_metric);
@@ -199,7 +199,8 @@ class PseudoErrorLoss : public MetricNoCache {
return std::make_tuple(v, wt);
});
std::array<double, 2> dat{result.Residue(), result.Weights()};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
return EvalRowMAPE::GetFinal(dat[0], dat[1]);
}
};
@@ -243,11 +244,11 @@ struct EvalError {
};
struct EvalPoissonNegLogLik {
const char *Name() const {
[[nodiscard]] const char *Name() const {
return "poisson-nloglik";
}
XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
[[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
const bst_float eps = 1e-16f;
if (py < eps) py = eps;
return common::LogGamma(y + 1.0f) + py - std::log(py) * y;
@@ -266,9 +267,9 @@ struct EvalPoissonNegLogLik {
* predt >= 0
*/
struct EvalGammaDeviance {
const char *Name() const { return "gamma-deviance"; }
[[nodiscard]] const char *Name() const { return "gamma-deviance"; }
XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const {
[[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const {
predt += kRtEps;
label += kRtEps;
return std::log(predt / label) + label / predt - 1;
@@ -287,7 +288,7 @@ struct EvalGammaNLogLik {
return "gamma-nloglik";
}
XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
[[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
py = std::max(py, 1e-6f);
// hardcoded dispersion.
float constexpr kPsi = 1.0;
@@ -313,7 +314,7 @@ struct EvalTweedieNLogLik {
CHECK(rho_ < 2 && rho_ >= 1)
<< "tweedie variance power must be in interval [1, 2)";
}
const char *Name() const {
[[nodiscard]] const char *Name() const {
static thread_local std::string name;
std::ostringstream os;
os << "tweedie-nloglik@" << rho_;
@@ -321,7 +322,7 @@ struct EvalTweedieNLogLik {
return name.c_str();
}
XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const {
[[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const {
bst_float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_);
bst_float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_);
return -a + b;
@@ -366,7 +367,8 @@ struct EvalEWiseBase : public MetricNoCache {
});
std::array<double, 2> dat{result.Residue(), result.Weights()};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
return Policy::GetFinal(dat[0], dat[1]);
}
@@ -438,7 +440,8 @@ class QuantileError : public MetricNoCache {
if (info.num_row_ == 0) {
// empty DMatrix on distributed env
std::array<double, 2> dat{0.0, 0.0};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
CHECK_GT(dat[1], 0);
return dat[0] / dat[1];
}
@@ -476,7 +479,8 @@ class QuantileError : public MetricNoCache {
return std::make_tuple(l, w);
});
std::array<double, 2> dat{result.Residue(), result.Weights()};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
CHECK_GT(dat[1], 0);
return dat[0] / dat[1];
}
@@ -501,5 +505,4 @@ class QuantileError : public MetricNoCache {
XGBOOST_REGISTER_METRIC(QuantileError, "quantile")
.describe("Quantile regression error.")
.set_body([](const char*) { return new QuantileError{}; });
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric

View File

@@ -1,6 +1,5 @@
/*!
* Copyright 2018-2022 by Contributors
* \file metric_common.h
/**
* Copyright 2018-2024, Contributors
*/
#ifndef XGBOOST_METRIC_METRIC_COMMON_H_
#define XGBOOST_METRIC_METRIC_COMMON_H_
@@ -24,7 +23,7 @@ class MetricNoCache : public Metric {
double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
double result{0.0};
auto const &info = p_fmat->Info();
collective::ApplyWithLabels(info, &result, sizeof(double),
collective::ApplyWithLabels(ctx_, info, &result, sizeof(double),
[&] { result = this->Eval(predts, info); });
return result;
}

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2015-2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file multiclass_metric.cc
* \brief evaluation metrics for multiclass classification.
* \author Kailong Chen, Tianqi Chen
@@ -24,8 +24,7 @@
#include "../common/device_helpers.cuh"
#endif // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(multiclass_metric);
@@ -40,11 +39,10 @@ class MultiClassMetricsReduction {
public:
MultiClassMetricsReduction() = default;
PackedReduceResult
CpuReduceMetrics(const HostDeviceVector<bst_float> &weights,
const HostDeviceVector<bst_float> &labels,
const HostDeviceVector<bst_float> &preds,
const size_t n_class, int32_t n_threads) const {
[[nodiscard]] PackedReduceResult CpuReduceMetrics(const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds,
const size_t n_class, int32_t n_threads) const {
size_t ndata = labels.Size();
const auto& h_labels = labels.HostVector();
@@ -184,7 +182,8 @@ struct EvalMClassBase : public MetricNoCache {
dat[0] = result.Residue();
dat[1] = result.Weights();
}
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
return Derived::GetFinal(dat[0], dat[1]);
}
/*!
@@ -247,5 +246,4 @@ XGBOOST_REGISTER_METRIC(MatchError, "merror")
XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss")
.describe("Multiclass negative loglikelihood.")
.set_body([](const char*) { return new EvalMultiLogLoss(); });
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric

View File

@@ -101,7 +101,7 @@ struct EvalAMS : public MetricNoCache {
}
}
const char* Name() const override {
[[nodiscard]] const char* Name() const override {
return name_.c_str();
}
@@ -159,7 +159,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
exc.Rethrow();
}
return collective::GlobalRatio(info, sum_metric, static_cast<double>(ngroups));
return collective::GlobalRatio(ctx_, info, sum_metric, static_cast<double>(ngroups));
}
[[nodiscard]] const char* Name() const override {
@@ -274,7 +274,7 @@ class EvalRankWithCache : public Metric {
double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
double result{0.0};
auto const& info = p_fmat->Info();
collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] {
auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
if (p_cache->Param() != param_) {
p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
@@ -294,9 +294,10 @@ class EvalRankWithCache : public Metric {
};
namespace {
double Finalize(Context const*, MetaInfo const& info, double score, double sw) {
double Finalize(Context const* ctx, MetaInfo const& info, double score, double sw) {
std::array<double, 2> dat{score, sw};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), 2));
collective::SafeColl(rc);
std::tie(score, sw) = std::tuple_cat(dat);
if (sw > 0.0) {
score = score / sw;

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include <dmlc/registry.h>
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2023 by Contributors
* Copyright 2019-2024, Contributors
* \file survival_metric.cu
* \brief Metrics for survival analysis
* \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
@@ -30,8 +30,7 @@ using ProbabilityDistributionType = xgboost::common::ProbabilityDistributionType
template <typename Distribution>
using AFTLoss = xgboost::common::AFTLoss<Distribution>;
namespace xgboost {
namespace metric {
namespace xgboost::metric {
// tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(survival_metric);
@@ -43,12 +42,11 @@ class ElementWiseSurvivalMetricsReduction {
policy_ = policy;
}
PackedReduceResult
CpuReduceMetrics(const HostDeviceVector<bst_float> &weights,
const HostDeviceVector<bst_float> &labels_lower_bound,
const HostDeviceVector<bst_float> &labels_upper_bound,
const HostDeviceVector<bst_float> &preds,
int32_t n_threads) const {
[[nodiscard]] PackedReduceResult CpuReduceMetrics(
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels_lower_bound,
const HostDeviceVector<bst_float>& labels_upper_bound,
const HostDeviceVector<bst_float>& preds, int32_t n_threads) const {
size_t ndata = labels_lower_bound.Size();
CHECK_EQ(ndata, labels_upper_bound.Size());
@@ -156,7 +154,7 @@ class ElementWiseSurvivalMetricsReduction {
struct EvalIntervalRegressionAccuracy {
void Configure(const Args&) {}
const char* Name() const {
[[nodiscard]] const char* Name() const {
return "interval-regression-accuracy";
}
@@ -178,7 +176,7 @@ struct EvalAFTNLogLik {
param_.UpdateAllowUnknown(args);
}
const char* Name() const {
[[nodiscard]] const char* Name() const {
return "aft-nloglik";
}
@@ -214,7 +212,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
info.labels_upper_bound_, preds);
std::array<double, 2> dat{result.Residue(), result.Weights()};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
return Policy::GetFinal(dat[0], dat[1]);
}
@@ -231,7 +230,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
// This class exists because we want to perform dispatch according to the distribution type at
// configuration time, not at prediction time.
struct AFTNLogLikDispatcher : public MetricNoCache {
const char* Name() const override {
[[nodiscard]] const char* Name() const override {
return "aft-nloglik";
}
@@ -283,5 +282,4 @@ XGBOOST_REGISTER_METRIC(IntervalRegressionAccuracy, "interval-regression-accurac
return new EvalEWiseSurvivalBase<EvalIntervalRegressionAccuracy>();
});
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022-2023 by XGBoost Contributors
* Copyright 2022-2024, XGBoost Contributors
*/
#include "adaptive.h"
@@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
size_t n_leaf = nidx.size();
if (nptr.empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree);
return;
}
@@ -100,7 +100,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
predt.Size() / info.num_row_);
collective::ApplyWithLabels(
info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
ctx, info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
// loop over each leaf
common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
auto nidx = h_node_idx[k];
@@ -134,7 +134,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
});
});
UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree);
}
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022-2023 by XGBoost Contributors
* Copyright 2022-2024, XGBoost Contributors
*/
#include <thrust/sort.h>
@@ -157,7 +157,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
if (nptr.Empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
UpdateLeafValues(ctx, &quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
}
predt.SetDevice(ctx->Device());
@@ -167,7 +167,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
auto t_predt = d_predt.Slice(linalg::All(), group_idx);
HostDeviceVector<float> quantiles;
collective::ApplyWithLabels(info, &quantiles, [&] {
collective::ApplyWithLabels(ctx, info, &quantiles, [&] {
auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
@@ -193,6 +193,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
w_it + d_weights.size(), &quantiles);
}
});
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
UpdateLeafValues(ctx, &quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate,
p_tree);
}
} // namespace xgboost::obj::detail

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2022-2023 by XGBoost Contributors
* Copyright 2022-2024, XGBoost Contributors
*/
#pragma once
@@ -17,8 +17,7 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
namespace xgboost::obj {
namespace detail {
inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_nptr) {
@@ -36,13 +35,14 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
}
}
inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
MetaInfo const& info, float learning_rate, RegTree* p_tree) {
inline void UpdateLeafValues(Context const* ctx, std::vector<float>* p_quantiles,
std::vector<bst_node_t> const& nidx, MetaInfo const& info,
float learning_rate, RegTree* p_tree) {
auto& tree = *p_tree;
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
size_t n_leaf = collective::GlobalMax(info, h_node_idx.size());
size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size());
CHECK(quantiles.empty() || quantiles.size() == n_leaf);
if (quantiles.empty()) {
quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -52,12 +52,16 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
std::vector<int32_t> n_valids(quantiles.size());
std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
[](float q) { return static_cast<int32_t>(!std::isnan(q)); });
collective::GlobalSum(info, &n_valids);
auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(n_valids.data(), n_valids.size()));
collective::SafeColl(rc);
// convert to 0 for all reduce
std::replace_if(
quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
// use the mean value
collective::GlobalSum(info, &quantiles);
rc = collective::GlobalSum(ctx, info, linalg::MakeVec(quantiles.data(), quantiles.size()));
collective::SafeColl(rc);
for (size_t i = 0; i < n_leaf; ++i) {
if (n_valids[i] > 0) {
quantiles[i] /= static_cast<float>(n_valids[i]);
@@ -105,5 +109,4 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
predt, alpha, p_tree);
}
}
} // namespace obj
} // namespace xgboost
} // namespace xgboost::obj

View File

@@ -222,7 +222,7 @@ class LambdaRankObj : public FitIntercept {
};
MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
if (sum_lambda > 0.0) {
if (sum_lambda > 0.0 && param_.lambdarank_normalization) {
double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; });
@@ -474,7 +474,6 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
public:
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective.";
if (ctx_->IsCUDA()) {
return cuda_impl::LambdaRankGetGradientMAP(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
@@ -564,7 +563,6 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
public:
void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective.";
if (ctx_->IsCUDA()) {
return cuda_impl::LambdaRankGetGradientPairwise(
ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
@@ -610,6 +608,13 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
[[nodiscard]] const char* DefaultEvalMetric() const override {
return this->RankEvalMetric("ndcg");
}
[[nodiscard]] Json DefaultMetricConfig() const override {
Json config{Object{}};
config["name"] = String{DefaultEvalMetric()};
config["lambdarank_param"] = ToJson(param_);
return config;
}
};
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)

View File

@@ -270,12 +270,13 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::Ran
*/
auto d_weights = common::MakeOptionalWeights(ctx, info.weights_);
auto w_norm = p_cache->WeightNorm();
auto norm = p_cache->Param().lambdarank_normalization;
thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(),
[=] XGBOOST_DEVICE(std::size_t i) mutable {
auto g = dh::SegmentId(d_gptr, i);
auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
// Normalization
if (sum_lambda > 0.0) {
if (sum_lambda > 0.0 && norm) {
double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
d_gpair(i, 0) *= norm;
}

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2023 by XGBoost contributors
* Copyright 2023-2024, XGBoost contributors
*/
#include <array> // std::array
#include <cstddef> // std::size_t
@@ -170,7 +170,9 @@ class QuantileRegression : public ObjFunction {
double meanq = temp(0) * sw;
std::array<double, 2> dat{meanq, sw};
collective::GlobalSum(info, &dat);
auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
collective::SafeColl(rc);
std::tie(meanq, sw) = std::tuple_cat(dat);
meanq /= (sw + kRtEps);
base_score->Reshape(1);

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2015-2023 by XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file regression_obj.cu
* \brief Definition of single-value regression and classification objectives.
* \author Tianqi Chen, Kailong Chen
@@ -672,8 +672,12 @@ class MeanAbsoluteError : public ObjFunction {
std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
[w](float v) { return v * w; });
collective::GlobalSum(info, &out.Values());
collective::GlobalSum(info, &w, 1);
auto rc = collective::Success() << [&] {
return collective::GlobalSum(ctx_, info, out);
} << [&] {
return collective::GlobalSum(ctx_, info, linalg::MakeVec(&w, 1));
};
collective::SafeColl(rc);
if (common::CloseTo(w, 0.0)) {
// Mostly for handling empty dataset test.

View File

@@ -698,6 +698,67 @@ class CPUPredictor : public Predictor {
}
}
template <typename DataView>
void PredictContributionKernel(DataView batch, const MetaInfo& info,
const gbm::GBTreeModel& model,
const std::vector<bst_float>* tree_weights,
std::vector<std::vector<float>>* mean_values,
std::vector<RegTree::FVec>* feat_vecs,
std::vector<bst_float>* contribs, uint32_t ntree_limit,
bool approximate, int condition,
unsigned condition_feature) const {
const int num_feature = model.learner_model_param->num_feature;
const int ngroup = model.learner_model_param->num_output_group;
CHECK_NE(ngroup, 0);
size_t const ncolumns = num_feature + 1;
CHECK_NE(ncolumns, 0);
auto base_margin = info.base_margin_.View(ctx_->Device());
auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0);
// parallel over local batch
common::ParallelFor(batch.Size(), this->ctx_->Threads(), [&](auto i) {
auto row_idx = batch.base_rowid + i;
RegTree::FVec &feats = (*feat_vecs)[omp_get_thread_num()];
if (feats.Size() == 0) {
feats.Init(num_feature);
}
std::vector<bst_float> this_tree_contribs(ncolumns);
// loop over all classes
for (int gid = 0; gid < ngroup; ++gid) {
bst_float* p_contribs = &(*contribs)[(row_idx * ngroup + gid) * ncolumns];
feats.Fill(batch[i]);
// calculate contributions
for (unsigned j = 0; j < ntree_limit; ++j) {
auto *tree_mean_values = &mean_values->at(j);
std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0);
if (model.tree_info[j] != gid) {
continue;
}
if (!approximate) {
CalculateContributions(*model.trees[j], feats, tree_mean_values,
&this_tree_contribs[0], condition, condition_feature);
} else {
model.trees[j]->CalculateContributionsApprox(
feats, tree_mean_values, &this_tree_contribs[0]);
}
for (size_t ci = 0; ci < ncolumns; ++ci) {
p_contribs[ci] +=
this_tree_contribs[ci] *
(tree_weights == nullptr ? 1 : (*tree_weights)[j]);
}
}
feats.Drop();
// add base margin to BIAS
if (base_margin.Size() != 0) {
CHECK_EQ(base_margin.Shape(1), ngroup);
p_contribs[ncolumns - 1] += base_margin(row_idx, gid);
} else {
p_contribs[ncolumns - 1] += base_score;
}
}
});
}
public:
explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {}
@@ -861,7 +922,6 @@ class CPUPredictor : public Predictor {
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
auto const n_threads = this->ctx_->Threads();
const int num_feature = model.learner_model_param->num_feature;
std::vector<RegTree::FVec> feat_vecs;
InitThreadTemp(n_threads, &feat_vecs);
const MetaInfo& info = p_fmat->Info();
@@ -869,10 +929,7 @@ class CPUPredictor : public Predictor {
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
ntree_limit = static_cast<unsigned>(model.trees.size());
}
const int ngroup = model.learner_model_param->num_output_group;
CHECK_NE(ngroup, 0);
size_t const ncolumns = num_feature + 1;
CHECK_NE(ncolumns, 0);
size_t const ncolumns = model.learner_model_param->num_feature + 1;
// allocate space for (number of features + bias) times the number of rows
std::vector<bst_float>& contribs = out_contribs->HostVector();
contribs.resize(info.num_row_ * ncolumns * model.learner_model_param->num_output_group);
@@ -884,53 +941,22 @@ class CPUPredictor : public Predictor {
common::ParallelFor(ntree_limit, n_threads, [&](bst_omp_uint i) {
FillNodeMeanValues(model.trees[i].get(), &(mean_values[i]));
});
auto base_margin = info.base_margin_.View(ctx_->Device());
auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0);
// start collecting the contributions
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
auto page = batch.GetView();
// parallel over local batch
common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
auto row_idx = batch.base_rowid + i;
RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
if (feats.Size() == 0) {
feats.Init(num_feature);
}
std::vector<bst_float> this_tree_contribs(ncolumns);
// loop over all classes
for (int gid = 0; gid < ngroup; ++gid) {
bst_float* p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
feats.Fill(page[i]);
// calculate contributions
for (unsigned j = 0; j < ntree_limit; ++j) {
auto *tree_mean_values = &mean_values.at(j);
std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0);
if (model.tree_info[j] != gid) {
continue;
}
if (!approximate) {
CalculateContributions(*model.trees[j], feats, tree_mean_values,
&this_tree_contribs[0], condition, condition_feature);
} else {
model.trees[j]->CalculateContributionsApprox(
feats, tree_mean_values, &this_tree_contribs[0]);
}
for (size_t ci = 0; ci < ncolumns; ++ci) {
p_contribs[ci] +=
this_tree_contribs[ci] *
(tree_weights == nullptr ? 1 : (*tree_weights)[j]);
}
}
feats.Drop();
// add base margin to BIAS
if (base_margin.Size() != 0) {
CHECK_EQ(base_margin.Shape(1), ngroup);
p_contribs[ncolumns - 1] += base_margin(row_idx, gid);
} else {
p_contribs[ncolumns - 1] += base_score;
}
}
});
if (!p_fmat->PageExists<SparsePage>()) {
std::vector<Entry> workspace(info.num_col_ * kUnroll * n_threads);
auto ft = p_fmat->Info().feature_types.ConstHostVector();
for (const auto &batch : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, {})) {
PredictContributionKernel(
GHistIndexMatrixView{batch, info.num_col_, ft, workspace, n_threads},
info, model, tree_weights, &mean_values, &feat_vecs, &contribs, ntree_limit,
approximate, condition, condition_feature);
}
} else {
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
PredictContributionKernel(
SparsePageView{&batch}, info, model, tree_weights, &mean_values, &feat_vecs,
&contribs, ntree_limit, approximate, condition, condition_feature);
}
}
}

View File

@@ -1048,6 +1048,9 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
if (!p_fmat->PageExists<SparsePage>()) {
LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
}
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
@@ -1108,6 +1111,9 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
if (!p_fmat->PageExists<SparsePage>()) {
LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
}
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {

View File

@@ -1,7 +1,7 @@
/**
* Copyright 2022 by XGBoost Contributors
* Copyright 2022-2024, XGBoost Contributors
*
* \brief Utilities for estimating initial score.
* @brief Utilities for estimating initial score.
*/
#include "fit_stump.h"
@@ -44,8 +44,11 @@ void FitStump(Context const* ctx, MetaInfo const& info,
}
}
CHECK(h_sum.CContiguous());
collective::GlobalSum(info, reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
auto as_double = linalg::MakeTensorView(
ctx, common::Span{reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2},
h_sum.Size() * 2);
auto rc = collective::GlobalSum(ctx, info, as_double);
collective::SafeColl(rc);
for (std::size_t i = 0; i < h_sum.Size(); ++i) {
out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));

View File

@@ -1,19 +1,18 @@
/**
* Copyright 2022-2023 by XGBoost Contributors
* Copyright 2022-2024, XGBoost Contributors
*
* \brief Utilities for estimating initial score.
* @brief Utilities for estimating initial score.
*/
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)
#include <thrust/execution_policy.h> // cuda::par
#include <thrust/iterator/counting_iterator.h> // thrust::make_counting_iterator
#endif // !defined(NOMINMAX)
#include <thrust/execution_policy.h> // cuda::par
#include <thrust/iterator/counting_iterator.h> // thrust::make_counting_iterator
#include <cstddef> // std::size_t
#include <cstddef> // std::size_t
#include "../collective/aggregator.cuh"
#include "../collective/communicator-inl.cuh"
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
#include "../collective/aggregator.cuh" // for GlobalSum
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
#include "fit_stump.h"
#include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
#include "xgboost/context.h" // Context

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
* Copyright 2020-2024, XGBoost Contributors
*/
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
@@ -52,7 +52,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
*
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair,
MetaInfo const& info) {
using GradientSumT = GradientPairPrecise;
using T = typename GradientSumT::ValueT;
@@ -66,11 +66,14 @@ GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair c
// Treat pair as array of 4 primitive types to allreduce
using ReduceT = typename decltype(p.first)::ValueT;
static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(reinterpret_cast<ReduceT*>(&p), 4));
collective::SafeColl(rc);
GradientPair positive_sum{p.first}, negative_sum{p.second};
std::size_t total_rows = gpair.size();
collective::GlobalSum(info, &total_rows, 1);
rc = collective::GlobalSum(ctx, info, linalg::MakeVec(&total_rows, 1));
collective::SafeColl(rc);
auto histogram_rounding =
GradientSumT{common::CreateRoundingFactor<T>(

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
* Copyright 2021-2024, XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
@@ -26,6 +26,47 @@
#include "xgboost/linalg.h" // for Constants, Vector
namespace xgboost::tree {
/**
* @brief Gather the expand entries from all the workers.
* @param entries Local expand entries on this worker.
* @return Global expand entries gathered from all workers.
*/
template <typename ExpandEntry>
std::enable_if_t<std::is_same_v<ExpandEntry, CPUExpandEntry> ||
std::is_same_v<ExpandEntry, MultiExpandEntry>,
std::vector<ExpandEntry>>
AllgatherColumnSplit(std::vector<ExpandEntry> const &entries) {
auto const n_entries = entries.size();
// First, gather all the primitive fields.
std::vector<ExpandEntry> local_entries(n_entries);
// Collect and serialize all entries
std::vector<std::vector<char>> serialized_entries;
for (std::size_t i = 0; i < n_entries; ++i) {
Json jentry{Object{}};
entries[i].Save(&jentry);
std::vector<char> out;
Json::Dump(jentry, &out, std::ios::binary);
serialized_entries.emplace_back(std::move(out));
}
auto all_serialized = collective::VectorAllgatherV(serialized_entries);
CHECK_GE(all_serialized.size(), local_entries.size());
std::vector<ExpandEntry> all_entries(all_serialized.size());
std::transform(all_serialized.cbegin(), all_serialized.cend(), all_entries.begin(),
[](std::vector<char> const &e) {
ExpandEntry entry;
auto je = Json::Load(StringView{e.data(), e.size()}, std::ios::binary);
entry.Load(je);
return entry;
});
return all_entries;
}
class HistEvaluator {
private:
struct NodeEntry {
@@ -36,8 +77,8 @@ class HistEvaluator {
};
private:
Context const* ctx_;
TrainParam const* param_;
Context const *ctx_;
TrainParam const *param_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
TreeEvaluator tree_evaluator_;
bool is_col_split_{false};
@@ -202,7 +243,7 @@ class HistEvaluator {
common::CatBitField cat_bits{best.cat_bits};
bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin);
CHECK_GT(partition, 0);
std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) {
std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](std::size_t c) {
auto cat = cut_val[c + f_begin];
cat_bits.Set(cat);
});
@@ -285,57 +326,23 @@ class HistEvaluator {
return left_sum;
}
/**
* @brief Gather the expand entries from all the workers.
* @param entries Local expand entries on this worker.
* @return Global expand entries gathered from all workers.
*/
std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const num_entries = entries.size();
// First, gather all the primitive fields.
std::vector<CPUExpandEntry> local_entries(num_entries);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
for (std::size_t i = 0; i < num_entries; i++) {
local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
}
auto all_entries = collective::Allgather(local_entries);
// Gather all the cat_bits.
auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
// Copy the cat_bits back into all expand entries.
all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
all_entries[i].split.cat_bits.begin());
});
return all_entries;
}
public:
void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<CPUExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
auto& entries = *p_entries;
auto &entries = *p_entries;
// All nodes are on the same level, so we can store the shared ptr.
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
entries.size());
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
for (size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
auto nidx = entries[nidx_in_set].nid;
features[nidx_in_set] =
column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
}
CHECK(!features.empty());
const size_t grain_size =
std::max<size_t>(1, features.front()->Size() / n_threads);
common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
return features[nidx_in_set]->Size();
}, grain_size);
const size_t grain_size = std::max<size_t>(1, features.front()->Size() / n_threads);
common::BlockedSpace2d space(
entries.size(), [&](size_t nidx_in_set) { return features[nidx_in_set]->Size(); },
grain_size);
std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
for (size_t i = 0; i < entries.size(); ++i) {
@@ -344,7 +351,7 @@ class HistEvaluator {
}
}
auto evaluator = tree_evaluator_.GetEvaluator();
auto const& cut_ptrs = cut.Ptrs();
auto const &cut_ptrs = cut.Ptrs();
common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
auto tidx = omp_get_thread_num();
@@ -385,18 +392,16 @@ class HistEvaluator {
}
});
for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
++nidx_in_set) {
for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
for (auto tidx = 0; tidx < n_threads; ++tidx) {
entries[nidx_in_set].split.Update(
tloc_candidates[n_threads * nidx_in_set + tidx].split);
entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto all_entries = Allgather(entries);
auto all_entries = AllgatherColumnSplit(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(
@@ -407,7 +412,7 @@ class HistEvaluator {
}
// Add splits to tree, handles all statistic
void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
RegTree &tree = *p_tree;
@@ -437,8 +442,7 @@ class HistEvaluator {
auto left_child = tree[candidate.nid].LeftChild();
auto right_child = tree[candidate.nid].RightChild();
tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
tree[candidate.nid].SplitIndex(), left_weight,
right_weight);
tree[candidate.nid].SplitIndex(), left_weight, right_weight);
evaluator = tree_evaluator_.GetEvaluator();
snode_.resize(tree.GetNodes().size());
@@ -449,8 +453,7 @@ class HistEvaluator {
snode_.at(right_child).root_gain =
evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum});
interaction_constraints_.Split(candidate.nid,
tree[candidate.nid].SplitIndex(), left_child,
interaction_constraints_.Split(candidate.nid, tree[candidate.nid].SplitIndex(), left_child,
right_child);
}
@@ -571,53 +574,6 @@ class HistMultiEvaluator {
return false;
}
/**
* @brief Gather the expand entries from all the workers.
* @param entries Local expand entries on this worker.
* @return Global expand entries gathered from all workers.
*/
std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
auto const world = collective::GetWorldSize();
auto const num_entries = entries.size();
// First, gather all the primitive fields.
std::vector<MultiExpandEntry> local_entries(num_entries);
std::vector<uint32_t> cat_bits;
std::vector<std::size_t> cat_bits_sizes;
std::vector<GradientPairPrecise> gradients;
for (std::size_t i = 0; i < num_entries; i++) {
local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients);
}
auto all_entries = collective::Allgather(local_entries);
// Gather all the cat_bits.
auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
// Gather all the gradients.
auto const num_gradients = gradients.size();
auto const all_gradients = collective::Allgather(gradients);
auto const total_entries = num_entries * world;
auto const gradients_per_entry = num_gradients / num_entries;
auto const gradients_per_side = gradients_per_entry / 2;
common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
// Copy the cat_bits back into all expand entries.
all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());
// Copy the gradients back into all expand entries.
all_entries[i].split.left_sum.resize(gradients_per_side);
std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
all_entries[i].split.left_sum.begin());
all_entries[i].split.right_sum.resize(gradients_per_side);
std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
gradients_per_side, all_entries[i].split.right_sum.begin());
});
return all_entries;
}
public:
void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@@ -676,7 +632,7 @@ class HistMultiEvaluator {
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto all_entries = Allgather(entries);
auto all_entries = AllgatherColumnSplit(entries);
for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(

View File

@@ -90,7 +90,6 @@ struct ExpandEntryImpl {
}
self->split.is_cat = get<Boolean const>(split["is_cat"]);
self->LoadGrad(split);
}
};
@@ -106,8 +105,8 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
void SaveGrad(Json* p_out) const {
auto& out = *p_out;
auto save = [&](std::string const& name, GradStats const& sum) {
out[name] = F32Array{2};
auto& array = get<F32Array>(out[name]);
out[name] = F64Array{2};
auto& array = get<F64Array>(out[name]);
array[0] = sum.GetGrad();
array[1] = sum.GetHess();
};
@@ -115,9 +114,9 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
save("right_sum", this->split.right_sum);
}
void LoadGrad(Json const& in) {
auto const& left_sum = get<F32Array const>(in["left_sum"]);
auto const& left_sum = get<F64Array const>(in["left_sum"]);
this->split.left_sum = GradStats{left_sum[0], left_sum[1]};
auto const& right_sum = get<F32Array const>(in["right_sum"]);
auto const& right_sum = get<F64Array const>(in["right_sum"]);
this->split.right_sum = GradStats{right_sum[0], right_sum[1]};
}
@@ -173,8 +172,8 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
void SaveGrad(Json* p_out) const {
auto& out = *p_out;
auto save = [&](std::string const& name, std::vector<GradientPairPrecise> const& sum) {
out[name] = F32Array{sum.size() * 2};
auto& array = get<F32Array>(out[name]);
out[name] = F64Array{sum.size() * 2};
auto& array = get<F64Array>(out[name]);
for (std::size_t i = 0, j = 0; i < sum.size(); i++, j += 2) {
array[j] = sum[i].GetGrad();
array[j + 1] = sum[i].GetHess();
@@ -185,7 +184,7 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
}
void LoadGrad(Json const& in) {
auto load = [&](std::string const& name, std::vector<GradientPairPrecise>* p_sum) {
auto const& array = get<F32Array const>(in[name]);
auto const& array = get<F64Array const>(in[name]);
auto& sum = *p_sum;
sum.resize(array.size() / 2);
for (std::size_t i = 0, j = 0; i < sum.size(); ++i, j += 2) {

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2015-2023, XGBoost Contributors
* Copyright 2015-2024, XGBoost Contributors
* \file tree_model.cc
* \brief model structure for tree
*/
@@ -8,6 +8,7 @@
#include <xgboost/json.h>
#include <xgboost/tree_model.h>
#include <array> // for array
#include <cmath>
#include <iomanip>
#include <limits>
@@ -15,7 +16,7 @@
#include <type_traits>
#include "../common/categorical.h"
#include "../common/common.h" // for EscapeU8
#include "../common/common.h" // for EscapeU8
#include "../predictor/predict_fn.h"
#include "io_utils.h" // for GetElem
#include "param.h"
@@ -31,26 +32,50 @@ namespace tree {
DMLC_REGISTER_PARAMETER(TrainParam);
}
namespace {
template <typename Float>
std::enable_if_t<std::is_floating_point_v<Float>, std::string> ToStr(Float value) {
int32_t constexpr kFloatMaxPrecision = std::numeric_limits<float>::max_digits10;
static_assert(std::is_floating_point<Float>::value,
"Use std::to_string instead for non-floating point values.");
std::stringstream ss;
ss << std::setprecision(kFloatMaxPrecision) << value;
return ss.str();
}
template <typename Float>
std::string ToStr(linalg::VectorView<Float> value, bst_target_t limit) {
int32_t constexpr kFloatMaxPrecision = std::numeric_limits<float>::max_digits10;
static_assert(std::is_floating_point<Float>::value,
"Use std::to_string instead for non-floating point values.");
std::stringstream ss;
ss << std::setprecision(kFloatMaxPrecision);
if (value.Size() == 1) {
ss << value(0);
return ss.str();
}
CHECK_GE(limit, 2);
auto n = std::min(static_cast<bst_target_t>(value.Size() - 1), limit - 1);
ss << "[";
for (std::size_t i = 0; i < n; ++i) {
ss << value(i) << ", ";
}
if (value.Size() > limit) {
ss << "..., ";
}
ss << value(value.Size() - 1) << "]";
return ss.str();
}
} // namespace
/*!
* \brief Base class for dump model implementation, modeling closely after code generator.
*/
class TreeGenerator {
protected:
static int32_t constexpr kFloatMaxPrecision =
std::numeric_limits<bst_float>::max_digits10;
FeatureMap const& fmap_;
std::stringstream ss_;
bool const with_stats_;
template <typename Float>
static std::string ToStr(Float value) {
static_assert(std::is_floating_point<Float>::value,
"Use std::to_string instead for non-floating point values.");
std::stringstream ss;
ss << std::setprecision(kFloatMaxPrecision) << value;
return ss.str();
}
static std::string Tabs(uint32_t n) {
std::string res;
for (uint32_t i = 0; i < n; ++i) {
@@ -258,10 +283,10 @@ class TextGenerator : public TreeGenerator {
kLeafTemplate,
{{"{tabs}", SuperT::Tabs(depth)},
{"{nid}", std::to_string(nid)},
{"{leaf}", SuperT::ToStr(tree[nid].LeafValue())},
{"{leaf}", ToStr(tree[nid].LeafValue())},
{"{stats}", with_stats_ ?
SuperT::Match(kStatTemplate,
{{"{cover}", SuperT::ToStr(tree.Stat(nid).sum_hess)}}) : ""}});
{{"{cover}", ToStr(tree.Stat(nid).sum_hess)}}) : ""}});
return result;
}
@@ -311,14 +336,14 @@ class TextGenerator : public TreeGenerator {
static std::string const kQuantitiveTemplate =
"{tabs}{nid}:[{fname}<{cond}] yes={left},no={right},missing={missing}";
auto cond = tree[nid].SplitCond();
return SplitNodeImpl(tree, nid, kQuantitiveTemplate, SuperT::ToStr(cond), depth);
return SplitNodeImpl(tree, nid, kQuantitiveTemplate, ToStr(cond), depth);
}
std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
auto cond = tree[nid].SplitCond();
static std::string const kNodeTemplate =
"{tabs}{nid}:[{fname}<{cond}] yes={left},no={right},missing={missing}";
return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth);
}
std::string Categorical(RegTree const &tree, int32_t nid,
@@ -336,8 +361,8 @@ class TextGenerator : public TreeGenerator {
static std::string const kStatTemplate = ",gain={loss_chg},cover={sum_hess}";
std::string const result = SuperT::Match(
kStatTemplate,
{{"{loss_chg}", SuperT::ToStr(tree.Stat(nid).loss_chg)},
{"{sum_hess}", SuperT::ToStr(tree.Stat(nid).sum_hess)}});
{{"{loss_chg}", ToStr(tree.Stat(nid).loss_chg)},
{"{sum_hess}", ToStr(tree.Stat(nid).sum_hess)}});
return result;
}
@@ -393,11 +418,11 @@ class JsonGenerator : public TreeGenerator {
std::string result = SuperT::Match(
kLeafTemplate,
{{"{nid}", std::to_string(nid)},
{"{leaf}", SuperT::ToStr(tree[nid].LeafValue())},
{"{leaf}", ToStr(tree[nid].LeafValue())},
{"{stat}", with_stats_ ? SuperT::Match(
kStatTemplate,
{{"{sum_hess}",
SuperT::ToStr(tree.Stat(nid).sum_hess)}}) : ""}});
ToStr(tree.Stat(nid).sum_hess)}}) : ""}});
return result;
}
@@ -468,7 +493,7 @@ class JsonGenerator : public TreeGenerator {
R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
R"I("missing": {missing})I";
bst_float cond = tree[nid].SplitCond();
return SplitNodeImpl(tree, nid, kQuantitiveTemplate, SuperT::ToStr(cond), depth);
return SplitNodeImpl(tree, nid, kQuantitiveTemplate, ToStr(cond), depth);
}
std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
@@ -477,7 +502,7 @@ class JsonGenerator : public TreeGenerator {
R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I"
R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
R"I("missing": {missing})I";
return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth);
}
std::string NodeStat(RegTree const& tree, int32_t nid) const override {
@@ -485,8 +510,8 @@ class JsonGenerator : public TreeGenerator {
R"S(, "gain": {loss_chg}, "cover": {sum_hess})S";
auto result = SuperT::Match(
kStatTemplate,
{{"{loss_chg}", SuperT::ToStr(tree.Stat(nid).loss_chg)},
{"{sum_hess}", SuperT::ToStr(tree.Stat(nid).sum_hess)}});
{{"{loss_chg}", ToStr(tree.Stat(nid).loss_chg)},
{"{sum_hess}", ToStr(tree.Stat(nid).sum_hess)}});
return result;
}
@@ -622,11 +647,11 @@ class GraphvizGenerator : public TreeGenerator {
protected:
template <bool is_categorical>
std::string BuildEdge(RegTree const &tree, bst_node_t nid, int32_t child, bool left) const {
std::string BuildEdge(RegTree const &tree, bst_node_t nidx, int32_t child, bool left) const {
static std::string const kEdgeTemplate =
" {nid} -> {child} [label=\"{branch}\" color=\"{color}\"]\n";
// Is this the default child for missing value?
bool is_missing = tree[nid].DefaultChild() == child;
bool is_missing = tree.DefaultChild(nidx) == child;
std::string branch;
if (is_categorical) {
branch = std::string{left ? "no" : "yes"} + std::string{is_missing ? ", missing" : ""};
@@ -635,7 +660,7 @@ class GraphvizGenerator : public TreeGenerator {
}
std::string buffer =
SuperT::Match(kEdgeTemplate,
{{"{nid}", std::to_string(nid)},
{{"{nid}", std::to_string(nidx)},
{"{child}", std::to_string(child)},
{"{color}", is_missing ? param_.yes_color : param_.no_color},
{"{branch}", branch}});
@@ -644,68 +669,77 @@ class GraphvizGenerator : public TreeGenerator {
// Only indicator is different, so we combine all different node types into this
// function.
std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t) const override {
auto split_index = tree[nid].SplitIndex();
auto cond = tree[nid].SplitCond();
std::string PlainNode(RegTree const& tree, bst_node_t nidx, uint32_t) const override {
auto split_index = tree.SplitIndex(nidx);
auto cond = tree.SplitCond(nidx);
static std::string const kNodeTemplate = " {nid} [ label=\"{fname}{<}{cond}\" {params}]\n";
bool has_less =
(split_index >= fmap_.Size()) || fmap_.TypeOf(split_index) != FeatureMap::kIndicator;
std::string result =
SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nid)},
SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nidx)},
{"{fname}", GetFeatureName(fmap_, split_index)},
{"{<}", has_less ? "<" : ""},
{"{cond}", has_less ? SuperT::ToStr(cond) : ""},
{"{cond}", has_less ? ToStr(cond) : ""},
{"{params}", param_.condition_node_params}});
result += BuildEdge<false>(tree, nid, tree[nid].LeftChild(), true);
result += BuildEdge<false>(tree, nid, tree[nid].RightChild(), false);
result += BuildEdge<false>(tree, nidx, tree.LeftChild(nidx), true);
result += BuildEdge<false>(tree, nidx, tree.RightChild(nidx), false);
return result;
};
std::string Categorical(RegTree const& tree, int32_t nid, uint32_t) const override {
std::string Categorical(RegTree const& tree, bst_node_t nidx, uint32_t) const override {
static std::string const kLabelTemplate =
" {nid} [ label=\"{fname}:{cond}\" {params}]\n";
auto cats = GetSplitCategories(tree, nid);
auto cats = GetSplitCategories(tree, nidx);
auto cats_str = PrintCatsAsSet(cats);
auto split_index = tree[nid].SplitIndex();
auto split_index = tree.SplitIndex(nidx);
std::string result =
SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nid)},
SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nidx)},
{"{fname}", GetFeatureName(fmap_, split_index)},
{"{cond}", cats_str},
{"{params}", param_.condition_node_params}});
result += BuildEdge<true>(tree, nid, tree[nid].LeftChild(), true);
result += BuildEdge<true>(tree, nid, tree[nid].RightChild(), false);
result += BuildEdge<true>(tree, nidx, tree.LeftChild(nidx), true);
result += BuildEdge<true>(tree, nidx, tree.RightChild(nidx), false);
return result;
}
std::string LeafNode(RegTree const& tree, int32_t nid, uint32_t) const override {
static std::string const kLeafTemplate =
" {nid} [ label=\"leaf={leaf-value}\" {params}]\n";
auto result = SuperT::Match(kLeafTemplate, {
{"{nid}", std::to_string(nid)},
{"{leaf-value}", ToStr(tree[nid].LeafValue())},
{"{params}", param_.leaf_node_params}});
return result;
};
std::string LeafNode(RegTree const& tree, bst_node_t nidx, uint32_t) const override {
static std::string const kLeafTemplate = " {nid} [ label=\"leaf={leaf-value}\" {params}]\n";
// hardcoded limit to avoid dumping long arrays into dot graph.
bst_target_t constexpr kLimit{3};
if (tree.IsMultiTarget()) {
auto value = tree.GetMultiTargetTree()->LeafValue(nidx);
auto result = SuperT::Match(kLeafTemplate, {{"{nid}", std::to_string(nidx)},
{"{leaf-value}", ToStr(value, kLimit)},
{"{params}", param_.leaf_node_params}});
return result;
} else {
auto value = tree[nidx].LeafValue();
auto result = SuperT::Match(kLeafTemplate, {{"{nid}", std::to_string(nidx)},
{"{leaf-value}", ToStr(value)},
{"{params}", param_.leaf_node_params}});
return result;
}
}
std::string BuildTree(RegTree const& tree, int32_t nid, uint32_t depth) override {
if (tree[nid].IsLeaf()) {
return this->LeafNode(tree, nid, depth);
std::string BuildTree(RegTree const& tree, bst_node_t nidx, uint32_t depth) override {
if (tree.IsLeaf(nidx)) {
return this->LeafNode(tree, nidx, depth);
}
static std::string const kNodeTemplate = "{parent}\n{left}\n{right}";
auto node = tree.GetSplitTypes()[nid] == FeatureType::kCategorical
? this->Categorical(tree, nid, depth)
: this->PlainNode(tree, nid, depth);
auto node = tree.GetSplitTypes()[nidx] == FeatureType::kCategorical
? this->Categorical(tree, nidx, depth)
: this->PlainNode(tree, nidx, depth);
auto result = SuperT::Match(
kNodeTemplate,
{{"{parent}", node},
{"{left}", this->BuildTree(tree, tree[nid].LeftChild(), depth+1)},
{"{right}", this->BuildTree(tree, tree[nid].RightChild(), depth+1)}});
{"{left}", this->BuildTree(tree, tree.LeftChild(nidx), depth+1)},
{"{right}", this->BuildTree(tree, tree.RightChild(nidx), depth+1)}});
return result;
}
@@ -733,7 +767,9 @@ XGBOOST_REGISTER_TREE_IO(GraphvizGenerator, "dot")
constexpr bst_node_t RegTree::kRoot;
std::string RegTree::DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const {
CHECK(!IsMultiTarget());
if (this->IsMultiTarget() && format != "dot") {
LOG(FATAL) << format << " tree dump " << MTNotImplemented();
}
std::unique_ptr<TreeGenerator> builder{TreeGenerator::Create(format, fmap, with_stats)};
builder->BuildTree(*this);

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by XGBoost contributors
* Copyright 2021-2024, XGBoost contributors
*
* \brief Implementation for the approx tree method.
*/
@@ -107,7 +107,10 @@ class GloablApproxBuilder {
for (auto const &g : gpair) {
root_sum.Add(g);
}
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
auto rc = collective::GlobalSum(ctx_, p_fmat->Info(),
linalg::MakeVec(reinterpret_cast<double *>(&root_sum), 2));
collective::SafeColl(rc);
std::vector<CPUExpandEntry> nodes{best};
this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),

View File

@@ -106,6 +106,9 @@ class ColMaker: public TreeUpdater {
if (dmat->Info().HasCategorical()) {
LOG(FATAL) << error::NoCategorical("Updater `grow_colmaker` or `exact` tree method");
}
if (param->colsample_bynode - 1.0 != 0.0) {
LOG(FATAL) << "column sample by node is not yet supported by the exact tree method";
}
this->LazyGetColumnDensity(dmat);
// rescale learning rate according to size of trees
interaction_constraints_.Configure(*param, dmat->Info().num_row_);
@@ -440,9 +443,8 @@ class ColMaker: public TreeUpdater {
}
// update the solution candidate
virtual void UpdateSolution(const SortedCSCPage &batch,
const std::vector<bst_feature_t> &feat_set,
const std::vector<GradientPair> &gpair, DMatrix *) {
void UpdateSolution(SortedCSCPage const &batch, const std::vector<bst_feature_t> &feat_set,
const std::vector<GradientPair> &gpair) {
// start enumeration
const auto num_features = feat_set.size();
CHECK(this->ctx_);
@@ -466,17 +468,15 @@ class ColMaker: public TreeUpdater {
}
});
}
// find splits at current level, do split per level
inline void FindSplit(int depth,
const std::vector<int> &qexpand,
const std::vector<GradientPair> &gpair,
DMatrix *p_fmat,
RegTree *p_tree) {
void FindSplit(bst_node_t depth, const std::vector<int> &qexpand,
std::vector<GradientPair> const &gpair, DMatrix *p_fmat, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
auto feat_set = column_sampler_->GetFeatureSet(depth);
for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat);
this->UpdateSolution(batch, feat_set->HostVector(), gpair);
}
// after this each thread's stemp will get the best candidates, aggregate results
this->SyncBestSolution(qexpand);

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2017-2023 by XGBoost contributors
* Copyright 2017-2024, XGBoost contributors
*/
#include <thrust/copy.h>
#include <thrust/reduce.h>
@@ -735,7 +735,9 @@ struct GPUHistMakerDevice {
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);
auto rc = collective::GlobalSum(
ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
collective::SafeColl(rc);
hist.AllocateHistograms({kRootNIdx});
this->BuildHist(kRootNIdx);

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2017-2023, XGBoost Contributors
* Copyright 2017-2024, XGBoost Contributors
* \file updater_quantile_hist.cc
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Checn, Egor Smirnov
@@ -149,9 +149,6 @@ class MultiTargetHistBuilder {
}
void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
if (collective::IsDistributed()) {
LOG(FATAL) << "Distributed training for vector-leaf is not yet supported.";
}
monitor_->Start(__func__);
p_last_fmat_ = p_fmat;
@@ -202,8 +199,10 @@ class MultiTargetHistBuilder {
}
}
CHECK(root_sum.CContiguous());
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
root_sum.Size() * 2);
auto rc = collective::GlobalSum(
ctx_, p_fmat->Info(),
linalg::MakeVec(reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2));
collective::SafeColl(rc);
histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));
@@ -411,7 +410,9 @@ class HistUpdater {
for (auto const &grad : gpair_h) {
grad_stat.Add(grad.GetGrad(), grad.GetHess());
}
collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
auto rc = collective::GlobalSum(ctx_, p_fmat->Info(),
linalg::MakeVec(reinterpret_cast<double *>(&grad_stat), 2));
collective::SafeColl(rc);
}
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@@ -474,6 +475,7 @@ class QuantileHistMaker : public TreeUpdater {
std::unique_ptr<HistUpdater> p_impl_{nullptr};
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
std::shared_ptr<common::ColumnSampler> column_sampler_;
common::Monitor monitor_;
ObjInfo const *task_{nullptr};
HistMakerTrainParam hist_param_;