Merge branch 'master' into dev-hui

This commit is contained in:
amdsc21
2023-03-08 00:39:33 +01:00
221 changed files with 3122 additions and 1486 deletions

View File

@@ -455,7 +455,8 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
xgboost_CHECK_C_ARG_PTR(indptr);
xgboost_CHECK_C_ARG_PTR(indices);
xgboost_CHECK_C_ARG_PTR(data);
data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow};
data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data},
static_cast<std::size_t>(nrow)};
xgboost_CHECK_C_ARG_PTR(c_json_config);
auto config = Json::Load(StringView{c_json_config});
float missing = GetMissing(config);

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 XGBoost contributors
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#pragma once
#include <string>
@@ -9,7 +9,7 @@
namespace xgboost {
namespace collective {
/*!
/**
* \brief Initialize the collective communicator.
*
* Currently the communicator API is experimental, function signatures may change in the future
@@ -140,6 +140,19 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
}
}
/**
 * @brief Gathers data from all processes and distributes it to all processes.
 *
 * This assumes all ranks have the same size, and input data has been sliced into the
 * corresponding position.
 *
 * @param send_receive_buffer Buffer storing the data; used in-place as both this rank's
 *                            input slice and the destination for the gathered result.
 * @param size Size of the data in bytes.
 */
inline void Allgather(void *send_receive_buffer, std::size_t size) {
  // Thin forwarding wrapper over the process-global communicator singleton.
  Communicator::Get()->AllGather(send_receive_buffer, size);
}
/*!
* \brief Perform in-place allreduce. This function is NOT thread-safe.
*
@@ -197,7 +210,7 @@ inline void Allreduce(uint64_t *send_receive_buffer, size_t count) {
template <Operation op, typename T,
typename = std::enable_if_t<std::is_same<size_t, T>{} && !std::is_same<uint64_t, T>{}> >
inline void Allreduce(T *send_receive_buffer, size_t count) {
static_assert(sizeof(T) == sizeof(uint64_t), "");
static_assert(sizeof(T) == sizeof(uint64_t));
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
}

View File

@@ -1,10 +1,32 @@
/*!
* Copyright 2022 by XGBoost Contributors
/**
* Copyright 2022-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_COMMON_ALGORITHM_H_
#define XGBOOST_COMMON_ALGORITHM_H_
#include <algorithm> // std::upper_bound
#include <cinttypes> // std::size_t
#include <algorithm> // upper_bound, stable_sort, sort, max
#include <cinttypes> // size_t
#include <functional> // less
#include <iterator> // iterator_traits, distance
#include <vector> // vector
#include "numeric.h" // Iota
#include "xgboost/context.h" // Context
// clang with libstdc++ works as well
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
!defined(__APPLE__) && __has_include(<omp.h>)
#define GCC_HAS_PARALLEL 1
#endif // GLIC_VERSION
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define MSVC_HAS_PARALLEL 1
#endif // MSC
#if defined(GCC_HAS_PARALLEL)
#include <parallel/algorithm>
#elif defined(MSVC_HAS_PARALLEL)
#include <ppl.h>
#endif // GLIBC VERSION
namespace xgboost {
namespace common {
@@ -13,6 +35,63 @@ auto SegmentId(It first, It last, Idx idx) {
std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
return segment_id;
}
/**
 * @brief Stable sort of [begin, end) with `comp`, parallelized when possible.
 *
 * When more than one thread is configured and libstdc++ parallel mode is available
 * (GCC_HAS_PARALLEL), dispatches to __gnu_parallel::stable_sort with the context's thread
 * count; otherwise falls back to sequential std::stable_sort.  MSVC's PPL is deliberately
 * not used here (see comment below).
 */
template <typename Iter, typename Comp>
void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
  if (ctx->Threads() > 1) {
#if defined(GCC_HAS_PARALLEL)
    __gnu_parallel::stable_sort(begin, end, comp,
                                __gnu_parallel::default_parallel_tag(ctx->Threads()));
#else
    // the only stable sort is radix sort for msvc ppl.
    std::stable_sort(begin, end, comp);
#endif  // GCC_HAS_PARALLEL
  } else {
    std::stable_sort(begin, end, comp);
  }
}
/**
 * @brief Sort [begin, end) with `comp`, parallelized when possible.
 *
 * Dispatches to __gnu_parallel::sort (GCC parallel mode) or concurrency::parallel_sort
 * (MSVC PPL) when more than one thread is configured; otherwise — or when neither backend
 * is available — uses sequential std::sort.
 */
template <typename Iter, typename Comp>
void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
  if (ctx->Threads() > 1) {
#if defined(GCC_HAS_PARALLEL)
    __gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
#elif defined(MSVC_HAS_PARALLEL)
    auto n = std::distance(begin, end);
    // use chunk size as hint to number of threads. No local policy/scheduler input with the
    // concurrency module.
    std::size_t chunk_size = n / ctx->Threads();
    // 2048 is the default of msvc ppl as of v2022.
    chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
    concurrency::parallel_sort(begin, end, comp, chunk_size);
#else
    std::sort(begin, end, comp);
#endif  // GCC_HAS_PARALLEL / MSVC_HAS_PARALLEL
  } else {
    std::sort(begin, end, comp);
  }
}
/**
 * @brief Stable argsort: returns the indices that would sort [begin, end) by `comp`.
 *
 * CPU-only (enforced by the CHECK on ctx->IsCPU()).  Ties keep their original relative
 * order since the permutation is sorted with StableSort.
 *
 * @tparam Idx  Integer type of the returned indices.
 * @return Vector of distance(begin, end) indices forming the sorting permutation.
 */
template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
          typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
  CHECK(ctx->IsCPU());
  auto n = std::distance(begin, end);
  std::vector<Idx> result(n);
  // Fill result with 0..n-1 (project helper from numeric.h).
  Iota(ctx, result.begin(), result.end(), 0);
  // Compare indices by the values they refer to in the input range.
  auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };
  StableSort(ctx, result.begin(), result.end(), op);
  return result;
}
} // namespace common
} // namespace xgboost
#if defined(GCC_HAS_PARALLEL)
#undef GCC_HAS_PARALLEL
#endif // defined(GCC_HAS_PARALLEL)
#if defined(MSVC_HAS_PARALLEL)
#undef MSVC_HAS_PARALLEL
#endif // defined(MSVC_HAS_PARALLEL)
#endif // XGBOOST_COMMON_ALGORITHM_H_

View File

@@ -42,9 +42,9 @@ constexpr inline bst_cat_t OutOfRangeCat() {
inline XGBOOST_DEVICE bool InvalidCat(float cat) {
constexpr auto kMaxCat = OutOfRangeCat();
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat, "");
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1, "");
static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat, "");
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat);
static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1);
static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat);
return cat < 0 || cat >= kMaxCat;
}

View File

@@ -270,7 +270,9 @@ struct RyuPowLogUtils {
*/
static uint32_t MulPow5InvDivPow2(const uint32_t m, const uint32_t q,
const int32_t j) noexcept(true) {
return MulShift(m, kFloatPow5InvSplit[q], j);
static_assert(sizeof(kFloatPow5InvSplit) == 55 * sizeof(std::uint64_t));
assert(q < 55);
return MulShift(m, kFloatPow5InvSplit[q], j); // NOLINT
}
/*
@@ -495,12 +497,10 @@ class PowerBaseComputer {
static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2);
static_assert(static_cast<int32_t>(1) -
static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2) ==
-151,
"");
static_assert(static_cast<int32_t>(1) - static_cast<int32_t>(IEEE754::kFloatBias) -
static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
static_cast<int32_t>(2) ==
-151);
mantissa_base2 = f.mantissa;
} else {
base2_range.exponent = static_cast<int32_t>(f.exponent) - IEEE754::kFloatBias -
@@ -544,7 +544,7 @@ class RyuPrinter {
// Function precondition: v is not a 10-digit number.
// (f2s: 9 digits are sufficient for round-tripping.)
// (d2fixed: We print 9-digit blocks.)
static_assert(100000000 == Tens(8), "");
static_assert(100000000 == Tens(8));
assert(v < Tens(9));
if (v >= Tens(8)) {
return 9;
@@ -911,7 +911,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
// the bias and also special-case the value 0.
int32_t shift = (f_e2 == 0 ? 1 : f_e2) - exp_b2 - IEEE754::kFloatBias -
IEEE754::kFloatMantissaBits;
assert(shift >= 0);
assert(shift >= 1);
// We need to round up if the exact value is more than 0.5 above the value we
// computed. That's equivalent to checking if the last removed bit was 1 and
@@ -920,7 +920,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
//
// We need to update trailingZeros given that we have the exact output
// exponent ieee_e2 now.
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;
trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0; // NOLINT
uint32_t lastRemovedBit = (mantissa_b2 >> (shift - 1)) & 1;
bool roundup = (lastRemovedBit != 0) &&
(!trailing_zeros || (((mantissa_b2 >> shift) & 1) != 0));

View File

@@ -87,7 +87,7 @@ inline to_chars_result to_chars(char *first, char *last, int64_t value) { // NOL
if (value < 0) {
*first = '-';
std::advance(first, 1);
unsigned_value = uint64_t(~value) + uint64_t(1);
unsigned_value = static_cast<uint64_t>(~value) + static_cast<uint64_t>(1);
}
return detail::ToCharsUnsignedImpl(first, last, unsigned_value);
}

View File

@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
feature_offsets_[fid] = accum_index;
}
SetTypeSize(gmat.max_num_bins);
SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0);

View File

@@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
}
#endif
template <typename Idx, typename Container,
typename V = typename Container::value_type,
typename Comp = std::less<V>>
std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
std::vector<Idx> result(array.size());
std::iota(result.begin(), result.end(), 0);
auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
return result;
}
/**
* Last index of a group in a CSR style of index pointer.
*/
@@ -206,31 +195,6 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}
/**
* \brief A CRTP (curiously recurring template pattern) helper function.
*
* https://www.fluentcpp.com/2017/05/19/crtp-helper/
*
* Does two things:
* 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
* 2. Avoids having to `static_cast` in a lot of places.
*
* \tparam T The derived class in a CRTP hierarchy.
*/
template <typename T>
struct Crtp {
T &Underlying() { return static_cast<T &>(*this); }
T const &Underlying() const { return static_cast<T const &>(*this); }
};
/**
* \brief C++17 std::as_const
*/
template <typename T>
typename std::add_const<T>::type &AsConst(T &v) noexcept { // NOLINT(runtime/references)
return v;
}
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_

View File

@@ -1,12 +1,13 @@
/*!
* Copyright 2017 by Contributors
/**
* Copyright 2017-2023 by XGBoost Contributors
* \file compressed_iterator.h
*/
#pragma once
#include <xgboost/base.h>
#include <cmath>
#include <cstddef>
#include <algorithm>
#include <cmath>
#include <cstddef> // for size_t
#include "common.h"
@@ -36,7 +37,7 @@ static const int kPadding = 4; // Assign padding so we can read slightly off
// The number of bits required to represent a given unsigned range
inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
return common::Max(static_cast<size_t>(bits), size_t(1));
return common::Max(static_cast<size_t>(bits), static_cast<std::size_t>(1));
}
} // namespace detail

View File

@@ -20,6 +20,7 @@
#include <algorithm>
#include <chrono>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <numeric>
@@ -178,7 +179,7 @@ inline size_t MaxSharedMemory(int device_idx) {
dh::safe_cuda(cudaDeviceGetAttribute
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
device_idx));
return size_t(max_shared_memory);
return static_cast<std::size_t>(max_shared_memory);
}
/**
@@ -195,7 +196,7 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
dh::safe_cuda(cudaDeviceGetAttribute
(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
device_idx));
return size_t(max_shared_memory);
return static_cast<std::size_t>(max_shared_memory);
}
inline void CheckComputeCapability() {

View File

@@ -46,7 +46,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
if (!use_sorted) {
HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info),
m->Info().data_split_mode == DataSplitMode::kCol, n_threads);
m->IsColumnSplit(), n_threads);
for (auto const& page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
}
@@ -54,7 +54,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
} else {
SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info),
m->Info().data_split_mode == DataSplitMode::kCol, n_threads};
m->IsColumnSplit(), n_threads};
for (auto const& page : m->GetBatches<SortedCSCPage>()) {
container.PushColPage(page, info, hessian);
}

View File

@@ -1,33 +1,31 @@
/*!
* Copyright 2018~2020 XGBoost contributors
/**
* Copyright 2018~2023 by XGBoost contributors
*/
#include <xgboost/logging.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <xgboost/logging.h>
#include <cstddef> // for size_t
#include <memory>
#include <mutex>
#include <utility>
#include <vector>
#include "categorical.h"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "hist_util.cuh"
#include "hist_util.h"
#include "math.h" // NOLINT
#include "quantile.h"
#include "categorical.h"
#include "xgboost/host_device_vector.h"
namespace xgboost {
namespace common {
@@ -318,7 +316,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
size_t batch_nnz = batch.data.Size();
auto const& info = dmat->Info();
for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
if (has_weights) {
bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020 XGBoost contributors
/**
* Copyright 2020-2023 by XGBoost contributors
*
* \brief Front end and utilities for GPU based sketching. Works on sliding window
* instead of stream.
@@ -9,11 +9,13 @@
#include <thrust/host_vector.h>
#include <cstddef> // for size_t
#include "../data/device_adapter.cuh"
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "device_helpers.cuh"
#include "timer.h"
#include "../data/device_adapter.cuh"
namespace xgboost {
namespace common {
@@ -304,7 +306,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessWeightedSlidingWindow(batch, info,
num_cuts_per_feature,
HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
@@ -316,7 +319,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
sketch_container, num_cuts_per_feature);
}

View File

@@ -50,7 +50,7 @@ size_t PeekableInStream::PeekRead(void* dptr, size_t size) {
}
}
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream), pointer_{0} {
FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream) {
size_t constexpr kInitialSize = 4096;
size_t size{kInitialSize}, total{0};
buffer_.clear();

View File

@@ -27,8 +27,7 @@ using MemoryBufferStream = rabit::utils::MemoryBufferStream;
*/
class PeekableInStream : public dmlc::Stream {
public:
explicit PeekableInStream(dmlc::Stream* strm)
: strm_(strm), buffer_ptr_(0) {}
explicit PeekableInStream(dmlc::Stream* strm) : strm_(strm) {}
size_t Read(void* dptr, size_t size) override;
virtual size_t PeekRead(void* dptr, size_t size);
@@ -41,7 +40,7 @@ class PeekableInStream : public dmlc::Stream {
/*! \brief input stream */
dmlc::Stream *strm_;
/*! \brief current buffer pointer */
size_t buffer_ptr_;
size_t buffer_ptr_{0};
/*! \brief internal buffer */
std::string buffer_;
};
@@ -72,7 +71,7 @@ class FixedSizeStream : public PeekableInStream {
void Take(std::string* out);
private:
size_t pointer_;
size_t pointer_{0};
std::string buffer_;
};

View File

@@ -710,10 +710,10 @@ void Json::Dump(Json json, JsonWriter* writer) {
writer->Save(json);
}
static_assert(std::is_nothrow_move_constructible<Json>::value, "");
static_assert(std::is_nothrow_move_constructible<Object>::value, "");
static_assert(std::is_nothrow_move_constructible<Array>::value, "");
static_assert(std::is_nothrow_move_constructible<String>::value, "");
static_assert(std::is_nothrow_move_constructible<Json>::value);
static_assert(std::is_nothrow_move_constructible<Object>::value);
static_assert(std::is_nothrow_move_constructible<Array>::value);
static_assert(std::is_nothrow_move_constructible<String>::value);
Json UBJReader::ParseArray() {
auto marker = PeekNextChar();

View File

@@ -14,7 +14,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
if (ctx->IsCPU()) {
auto const& h_values = values.ConstHostVector();
auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
static_assert(std::is_same<decltype(result), double>::value, "");
static_assert(std::is_same<decltype(result), double>::value);
return result;
}
return cuda_impl::Reduce(ctx, values);

View File

@@ -42,8 +42,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
*/
template <typename InIt, typename OutIt, typename T>
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
// The number of threads is pegged to the batch size. If the OMP block is parallelized
// on anything other than the batch/block size, it should be reassigned
auto n = static_cast<size_t>(std::distance(begin, end));

View File

@@ -31,6 +31,8 @@ namespace common {
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
template<size_t BlockSize>
class PartitionBuilder {
using BitVector = RBitField8;
public:
template<typename Func>
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
@@ -121,27 +123,11 @@ class PartitionBuilder {
bool default_left = tree[nid].DefaultLeft();
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& index = gmat.index;
auto const& cut_values = gmat.cut.Values();
auto const& cut_ptrs = gmat.cut.Ptrs();
auto gidx_calc = [&](auto ridx) {
auto begin = gmat.RowIdx(ridx);
if (gmat.IsDense()) {
return static_cast<bst_bin_t>(index[begin + fid]);
}
auto end = gmat.RowIdx(ridx + 1);
auto f_begin = cut_ptrs[fid];
auto f_end = cut_ptrs[fid + 1];
// bypassing the column matrix as we need the cut value instead of bin idx for categorical
// features.
return BinarySearchBin(begin, end, index, f_begin, f_end);
};
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
auto gidx = gidx_calc(ridx);
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx]);
@@ -153,7 +139,7 @@ class PartitionBuilder {
};
auto pred_approx = [&](auto ridx) {
auto gidx = gidx_calc(ridx);
auto gidx = gmat.GetGindex(ridx, fid);
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
@@ -199,6 +185,84 @@ class PartitionBuilder {
SetNRightElems(node_in_set, range.begin(), n_right);
}
/**
 * @brief When data is split by column, we don't have all the features locally on the current
 * worker, so we go through all the rows and mark the bit vectors with the local decision, or
 * mark the row as missing when the feature value used for the split is absent here.
 *
 * Note: a set bit in `decision_bits` means the row goes LEFT (see the `go_left` branch
 * below); rows whose split feature has no local bin get the corresponding bit set in
 * `missing_bits` instead.  Bit positions are row indices relative to `gmat.base_rowid`.
 *
 * @param node_in_set    Index of the node inside `nodes`.
 * @param nodes          Candidate nodes being expanded.
 * @param range          Row range (into `rid`) this task is responsible for.
 * @param gmat           Quantized feature matrix used to look up bin indices/cut values.
 * @param column_matrix  Column store; only the uninitialized case is supported here.
 * @param tree           Tree providing the split feature/type for the node.
 * @param rid            Row-index buffer; [range.begin(), range.end()) selects this task's rows.
 * @param decision_bits  Output: bit set when the local decision is to go left.
 * @param missing_bits   Output: bit set when the split feature value is missing locally.
 */
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
              const common::Range1d range, GHistIndexMatrix const& gmat,
              const common::ColumnMatrix& column_matrix,
              const RegTree& tree, const size_t* rid,
              BitVector* decision_bits, BitVector* missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bst_feature_t fid = tree[nid].SplitIndex();
  bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
  auto node_cats = tree.NodeCats(nid);
  auto const& cut_values = gmat.cut.Values();
  if (!column_matrix.IsInitialized()) {
    for (auto row_id : rid_span) {
      // gidx is -1 when the row has no entry for this feature (value missing locally).
      auto gidx = gmat.GetGindex(row_id, fid);
      if (gidx > -1) {
        bool go_left = false;
        if (is_cat) {
          go_left = Decision(node_cats, cut_values[gidx]);
        } else {
          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
        }
        if (go_left) {
          decision_bits->Set(row_id - gmat.base_rowid);
        }
      } else {
        missing_bits->Set(row_id - gmat.base_rowid);
      }
    }
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }
}
/**
 * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
 * use them to partition the rows accordingly.
 *
 * A set bit in `decision_bits` sends the row left; for rows flagged in `missing_bits` the
 * node's default direction (`DefaultLeft`) is used.  Bit positions are row indices relative
 * to `gmat.base_rowid`, matching how MaskRows wrote them.
 *
 * @param node_in_set    Index of the node inside `nodes`.
 * @param nodes          Candidate nodes being expanded.
 * @param range          Row range (into `rid`) this task is responsible for.
 * @param gmat           Quantized feature matrix (provides base_rowid for bit addressing).
 * @param column_matrix  Column store; only the uninitialized case is supported here.
 * @param tree           Tree providing the node's default direction.
 * @param rid            Row-index buffer; [range.begin(), range.end()) selects this task's rows.
 * @param decision_bits  Aggregated per-row left/right decisions from all workers.
 * @param missing_bits   Aggregated per-row missing flags from all workers.
 */
void PartitionByMask(const size_t node_in_set,
                     std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
                     const common::Range1d range, GHistIndexMatrix const& gmat,
                     const common::ColumnMatrix& column_matrix, const RegTree& tree,
                     const size_t* rid, BitVector const& decision_bits,
                     BitVector const& missing_bits) {
  common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
  common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
  common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
  std::size_t nid = nodes[node_in_set].nid;
  bool default_left = tree[nid].DefaultLeft();
  auto pred_approx = [&](auto ridx) {
    // Missing rows follow the default direction; otherwise the aggregated decision wins.
    bool go_left = default_left;
    bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
    if (!is_missing) {
      go_left = decision_bits.Check(ridx - gmat.base_rowid);
    }
    return go_left;
  };
  std::pair<size_t, size_t> child_nodes_sizes;
  if (!column_matrix.IsInitialized()) {
    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
  } else {
    LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
  }
  const size_t n_left = child_nodes_sizes.first;
  const size_t n_right = child_nodes_sizes.second;
  SetNLeftElems(node_in_set, range.begin(), n_left);
  SetNRightElems(node_in_set, range.begin(), n_right);
}
// allocate thread local memory, should be called for each specific task
void AllocateForTask(size_t id) {
if (mem_blocks_[id].get() == nullptr) {

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2020-2022 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
@@ -109,7 +109,7 @@ void PruneImpl(common::Span<SketchContainer::OffsetT const> cuts_ptr,
template <typename T, typename U>
void CopyTo(Span<T> out, Span<U> src) {
CHECK_EQ(out.size(), src.size());
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
out.size_bytes(),
cudaMemcpyDefault));
@@ -143,7 +143,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));
dh::XGBCachingDeviceAllocator<Tuple> alloc;
static_assert(sizeof(Tuple) == sizeof(SketchEntry), "");
static_assert(sizeof(Tuple) == sizeof(SketchEntry));
// We reuse the memory for storing merge path.
common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
// Determine the merge path, 0 if element is from x, 1 if it's from y.

View File

@@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
for (size_t i = 0; i < h_features.size(); ++i) {
weights[i] = feature_weights_[h_features[i]];
}
CHECK(ctx_);
new_features.HostVector() =
WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
} else {
new_features.Resize(features.size());
std::copy(features.begin(), features.end(), new_features.HostVector().begin());

View File

@@ -20,7 +20,9 @@
#include <vector>
#include "../collective/communicator-inl.h"
#include "algorithm.h" // ArgSort
#include "common.h"
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h"
namespace xgboost {
@@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
* https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
*/
template <typename T>
std::vector<T> WeightedSamplingWithoutReplacement(
std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
std::vector<float> const& weights, size_t n) {
// ES sampling.
CHECK_EQ(array.size(), weights.size());
std::vector<float> keys(weights.size());
@@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
auto k = std::log(u) / w;
keys[i] = k;
}
auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
ind.resize(n);
std::vector<T> results(ind.size());
@@ -126,6 +128,7 @@ class ColumnSampler {
float colsample_bytree_{1.0f};
float colsample_bynode_{1.0f};
GlobalRandomEngine rng_;
Context const* ctx_;
public:
std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
@@ -157,12 +160,13 @@ class ColumnSampler {
* \param colsample_bytree
* \param skip_index_0 (Optional) True to skip index 0.
*/
void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
float colsample_bylevel, float colsample_bytree) {
void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
feature_weights_ = std::move(feature_weights);
colsample_bylevel_ = colsample_bylevel;
colsample_bytree_ = colsample_bytree;
colsample_bynode_ = colsample_bynode;
ctx_ = ctx;
if (feature_set_tree_ == nullptr) {
feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();

View File

@@ -77,14 +77,14 @@ class RowSetCollection {
if (row_indices_.empty()) { // edge case: empty instance set
constexpr size_t* kBegin = nullptr;
constexpr size_t* kEnd = nullptr;
static_assert(kEnd - kBegin == 0, "");
elem_of_each_node_.emplace_back(Elem(kBegin, kEnd, 0));
static_assert(kEnd - kBegin == 0);
elem_of_each_node_.emplace_back(kBegin, kEnd, 0);
return;
}
const size_t* begin = dmlc::BeginPtr(row_indices_);
const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
elem_of_each_node_.emplace_back(begin, end, 0);
}
std::vector<size_t>* Data() { return &row_indices_; }

View File

@@ -35,11 +35,11 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
auto iter = linalg::cbegin(ti_v);
float q{0};
if (opt_weights.Empty()) {
q = common::Quantile(0.5, iter, iter + ti_v.Size());
q = common::Quantile(ctx, 0.5, iter, iter + ti_v.Size());
} else {
CHECK_NE(t_v.Shape(1), 0);
auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
q = common::WeightedQuantile(ctx, 0.5, iter, iter + ti_v.Size(), w_it);
}
h_out(i) = q;
}

View File

@@ -4,46 +4,52 @@
#ifndef XGBOOST_COMMON_STATS_H_
#define XGBOOST_COMMON_STATS_H_
#include <algorithm>
#include <iterator>
#include <iterator> // for distance
#include <limits>
#include <vector>
#include "algorithm.h" // for StableSort
#include "common.h" // AssertGPUSupport, OptionalWeights
#include "optional_weight.h" // OptionalWeights
#include "transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h"
#include "xgboost/logging.h" // CHECK_GE
#include "xgboost/linalg.h" // TensorView,VectorView
#include "xgboost/logging.h" // CHECK_GE
namespace xgboost {
namespace common {
/**
* \brief Percentile with masked array using linear interpolation.
* @brief Quantile using linear interpolation.
*
* https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
*
* \param alpha Percentile, must be in range [0, 1].
* \param alpha Quantile, must be in range [0, 1].
* \param begin Iterator begin for input array.
* \param end Iterator end for input array.
*
* \return The result of interpolation.
*/
template <typename Iter>
float Quantile(double alpha, Iter const& begin, Iter const& end) {
float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& end) {
CHECK(alpha >= 0 && alpha <= 1);
auto n = static_cast<double>(std::distance(begin, end));
if (n == 0) {
return std::numeric_limits<float>::quiet_NaN();
}
std::vector<size_t> sorted_idx(n);
std::vector<std::size_t> sorted_idx(n);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
if (omp_in_parallel()) {
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
} else {
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
}
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
static_assert(std::is_same<decltype(val(0)), float>::value, "");
static_assert(std::is_same<decltype(val(0)), float>::value);
if (alpha <= (1 / (n + 1))) {
return val(0);
@@ -51,7 +57,7 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
if (alpha >= (n / (n + 1))) {
return val(sorted_idx.size() - 1);
}
assert(n != 0 && "The number of rows in a leaf can not be zero.");
double x = alpha * static_cast<double>((n + 1));
double k = std::floor(x) - 1;
CHECK_GE(k, 0);
@@ -66,30 +72,35 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
* \brief Calculate the weighted quantile with step function. Unlike the unweighted
* version, no interpolation is used.
*
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
* See https://aakinshin.net/posts/weighted-quantiles/ for some discussions on computing
* weighted quantile with interpolation.
*/
template <typename Iter, typename WeightIter>
float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
float WeightedQuantile(Context const* ctx, double alpha, Iter begin, Iter end, WeightIter w_begin) {
auto n = static_cast<double>(std::distance(begin, end));
if (n == 0) {
return std::numeric_limits<float>::quiet_NaN();
}
std::vector<size_t> sorted_idx(n);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
if (omp_in_parallel()) {
std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
} else {
StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
[&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
}
auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
std::vector<float> weight_cdf(n); // S_n
// weighted cdf is sorted during construction
weight_cdf[0] = *(weights + sorted_idx[0]);
weight_cdf[0] = *(w_begin + sorted_idx[0]);
for (size_t i = 1; i < n; ++i) {
weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
weight_cdf[i] = weight_cdf[i - 1] + w_begin[sorted_idx[i]];
}
float thresh = weight_cdf.back() * alpha;
size_t idx =
std::size_t idx =
std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
idx = std::min(idx, static_cast<size_t>(n - 1));
return val(idx);

View File

@@ -10,12 +10,13 @@
#include <cstring>
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // StableSort
#include "../common/api_entry.h" // XGBAPIThreadLocalEntry
#include "../common/group_data.h"
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h"
#include "../common/numeric.h" // Iota
#include "../common/threading_utils.h"
#include "../common/version.h"
#include "../data/adapter.h"
@@ -258,6 +259,19 @@ void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<Feat
}
}
/**
 * \brief Return the row indices ordered by the absolute value of the label.
 *
 * The ordering is computed lazily and memoized in label_order_cache_; the cached
 * result is reused as long as its size still matches the number of labels.
 */
const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
  // Fast path: the memoized ordering is still valid for the current label set.
  if (label_order_cache_.size() == labels.Size()) {
    return label_order_cache_;
  }
  label_order_cache_.resize(labels.Size());
  common::Iota(ctx, label_order_cache_.begin(), label_order_cache_.end(), 0);
  auto const& h_labels = labels.Data()->HostVector();
  // Stable comparison on |label| keeps ties in their original relative order.
  auto abs_less = [&h_labels](size_t lhs, size_t rhs) {
    return std::abs(h_labels[lhs]) < std::abs(h_labels[rhs]);
  };
  common::StableSort(ctx, label_order_cache_.begin(), label_order_cache_.end(), abs_less);
  return label_order_cache_;
}
void MetaInfo::LoadBinary(dmlc::Stream *fi) {
auto version = Version::Load(fi);
auto major = std::get<0>(version);
@@ -898,6 +912,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
if (!cache_file.empty()) {
LOG(FATAL) << "Column-wise data split is not support for external memory.";
}
LOG(CONSOLE) << "Splitting data by column";
auto* sliced = dmat->SliceCol(npart, partid);
delete dmat;
return sliced;

View File

@@ -1,12 +1,14 @@
/*!
* Copyright (c) 2019 by Contributors
/**
* Copyright 2019-2023 by XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
@@ -205,10 +207,10 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
}
});
dh::XGBCachingDeviceAllocator<char> alloc;
size_t row_stride = dh::Reduce(
thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
thrust::maximum<size_t>());
size_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
return row_stride;
}
}; // namespace data

View File

@@ -21,13 +21,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
common::Span<float> hess) {
common::Span<float> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in
// computation time (but higher memory usage).
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
max_num_bins = max_bins_per_feat;
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -64,7 +64,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
: row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0),
cut{std::forward<common::HistogramCuts>(cuts)},
max_num_bins(max_bin_per_feat),
max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
#if !defined(XGBOOST_USE_CUDA)
@@ -87,13 +87,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
}
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads) {
common::HistogramCuts cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads)
: cut{std::move(cuts)},
max_numeric_bins_per_feat{max_bins_per_feat},
base_rowid{batch.base_rowid},
isDense_{isDense} {
CHECK_GE(n_threads, 1);
base_rowid = batch.base_rowid;
isDense_ = isDense;
cut = cuts;
max_num_bins = max_bins_per_feat;
CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
@@ -128,12 +128,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) {
// compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize);
@@ -149,16 +150,24 @@ common::ColumnMatrix const &GHistIndexMatrix::Transpose() const {
return *columns_;
}
/**
 * \brief Look up the histogram bin index for a given row/feature pair.
 *
 * For a dense matrix the bin is addressed directly; for a sparse matrix the
 * row's index span is searched for a bin that belongs to the feature's cut
 * range (BinarySearchBin reports a miss — see the caller's -1 check).
 */
bst_bin_t GHistIndexMatrix::GetGindex(size_t ridx, size_t fidx) const {
  auto const row_begin = RowIdx(ridx);
  if (IsDense()) {
    // Dense layout stores one entry per feature, so the offset is row + fidx.
    return static_cast<bst_bin_t>(index[row_begin + fidx]);
  }
  auto const row_end = RowIdx(ridx + 1);
  auto const& feat_ptrs = cut.Ptrs();
  return BinarySearchBin(row_begin, row_end, index, feat_ptrs[fidx], feat_ptrs[fidx + 1]);
}
float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
auto const &values = cut.Values();
auto const &mins = cut.MinValues();
auto const &ptrs = cut.Ptrs();
if (is_cat) {
auto f_begin = ptrs[fidx];
auto f_end = ptrs[fidx + 1];
auto begin = RowIdx(ridx);
auto end = RowIdx(ridx + 1);
auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end);
auto gidx = GetGindex(ridx, fidx);
if (gidx == -1) {
return std::numeric_limits<float>::quiet_NaN();
}

View File

@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
EllpackPage const& in_page, BatchParam const& p)
: max_num_bins{p.max_bin} {
: max_numeric_bins_per_feat{p.max_bin} {
auto page = in_page.Impl();
isDense_ = page->is_dense;

View File

@@ -134,11 +134,15 @@ class GHistIndexMatrix {
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
common::HistogramCuts cut;
/*! \brief max_bin for each feature. */
bst_bin_t max_num_bins;
/** \brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
~GHistIndexMatrix();
/**
* \brief Constructor for SimpleDMatrix.
@@ -161,7 +165,7 @@ class GHistIndexMatrix {
* \brief Constructor for external memory.
*/
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
double sparse_thresh, int32_t n_threads);
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
@@ -224,6 +228,8 @@ class GHistIndexMatrix {
common::ColumnMatrix const& Transpose() const;
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
private:

View File

@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
if (!fi->Read(&page->hit_count)) {
return false;
}
if (!fi->Read(&page->max_num_bins)) {
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
return false;
}
if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t);
// max_bins, base row, is_dense
fo->Write(page.max_num_bins);
bytes += sizeof(page.max_num_bins);
fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense());

View File

@@ -213,7 +213,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
SyncFeatureType(&h_ft);
p_sketch.reset(new common::HostSketchContainer{
batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
proxy->Info().data_split_mode == DataSplitMode::kCol, ctx_.Threads()});
proxy->IsColumnSplit(), ctx_.Threads()});
}
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@@ -19,7 +19,7 @@ const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
namespace detail {
// Use device dispatch
std::size_t NSamplesDevice(DMatrixProxy *)
std::size_t NSamplesDevice(DMatrixProxy *) // NOLINT
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else
@@ -28,7 +28,7 @@ std::size_t NSamplesDevice(DMatrixProxy *)
return 0;
}
#endif
std::size_t NFeaturesDevice(DMatrixProxy *)
std::size_t NFeaturesDevice(DMatrixProxy *) // NOLINT
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else

View File

@@ -75,10 +75,7 @@ class GBLinear : public GradientBooster {
: GradientBooster{ctx},
learner_model_param_{learner_model_param},
model_{learner_model_param},
previous_model_{learner_model_param},
sum_instance_weight_(0),
sum_weight_complete_(false),
is_converged_(false) {}
previous_model_{learner_model_param} {}
void Configure(const Args& cfg) override {
if (model_.weight.size() == 0) {
@@ -344,10 +341,10 @@ class GBLinear : public GradientBooster {
GBLinearModel previous_model_;
GBLinearTrainParam param_;
std::unique_ptr<LinearUpdater> updater_;
double sum_instance_weight_;
bool sum_weight_complete_;
double sum_instance_weight_{};
bool sum_weight_complete_{false};
common::Monitor monitor_;
bool is_converged_;
bool is_converged_{false};
};
// register the objective functions

View File

@@ -47,12 +47,12 @@ class GBLinearModel : public Model {
DeprecatedGBLinearModelParam param_;
public:
int32_t num_boosted_rounds;
int32_t num_boosted_rounds{0};
LearnerModelParam const* learner_model_param;
public:
explicit GBLinearModel(LearnerModelParam const* learner_model_param) :
num_boosted_rounds{0}, learner_model_param {learner_model_param} {}
explicit GBLinearModel(LearnerModelParam const *learner_model_param)
: learner_model_param{learner_model_param} {}
void Configure(Args const &) { }
// weight for each of feature, bias is the last one

View File

@@ -32,15 +32,14 @@
#include "xgboost/string_view.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
DMLC_REGISTRY_FILE_TAG(gbtree);
void GBTree::Configure(const Args& cfg) {
void GBTree::Configure(Args const& cfg) {
this->cfg_ = cfg;
std::string updater_seq = tparam_.updater_seq;
tparam_.UpdateAllowUnknown(cfg);
tree_param_.UpdateAllowUnknown(cfg);
model_.Configure(cfg);
@@ -235,9 +234,11 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
CHECK_EQ(model_.param.num_parallel_tree, trees.size());
CHECK_EQ(model_.param.num_parallel_tree, 1)
<< "Boosting random forest is not supported for current objective.";
CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
auto const& position = node_position.at(tree_idx);
obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, group_idx, trees[tree_idx].get());
obj->UpdateTreeLeaf(position, p_fmat->Info(), tree_param_.learning_rate / trees.size(),
predictions, group_idx, trees[tree_idx].get());
}
}
@@ -388,9 +389,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
CHECK(out_position);
out_position->resize(new_trees.size());
// Rescale learning rate according to the size of trees
auto lr = tree_param_.learning_rate;
tree_param_.learning_rate /= static_cast<float>(new_trees.size());
for (auto& up : updaters_) {
up->Update(gpair, p_fmat, common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
up->Update(&tree_param_, gpair, p_fmat,
common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
}
tree_param_.learning_rate = lr;
}
void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
@@ -404,6 +411,8 @@ void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& ne
void GBTree::LoadConfig(Json const& in) {
CHECK_EQ(get<String>(in["name"]), "gbtree");
FromJson(in["gbtree_train_param"], &tparam_);
FromJson(in["tree_train_param"], &tree_param_);
// Process type cannot be kUpdate from loaded model
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
@@ -451,6 +460,7 @@ void GBTree::SaveConfig(Json* p_out) const {
auto& out = *p_out;
out["name"] = String("gbtree");
out["gbtree_train_param"] = ToJson(tparam_);
out["tree_train_param"] = ToJson(tree_param_);
// Process type cannot be kUpdate from loaded model
// This would cause all trees to be pushed to trees_to_update
@@ -1058,5 +1068,4 @@ XGBOOST_REGISTER_GBM(Dart, "dart")
GBTree* p = new Dart(booster_config, ctx);
return p;
});
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -20,6 +20,7 @@
#include "../common/common.h"
#include "../common/timer.h"
#include "../tree/param.h" // TrainParam
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@@ -405,8 +406,8 @@ class GBTree : public GradientBooster {
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const override {
return model_.DumpModel(fmap, with_stats, this->ctx_->Threads(), format);
}
@@ -428,6 +429,8 @@ class GBTree : public GradientBooster {
GBTreeModel model_;
// training parameter
GBTreeTrainParam tparam_;
// Tree training parameter
tree::TrainParam tree_param_;
// ----training fields----
bool showed_updater_warning_ {false};
bool specified_updater_ {false};

View File

@@ -21,7 +21,7 @@
#include <sstream>
#include <stack>
#include <string>
#include <utility>
#include <utility> // for as_const
#include <vector>
#include "collective/communicator-inl.h"
@@ -257,11 +257,11 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
: LearnerModelParam{user_param, t} {
std::swap(base_score_, base_margin);
// Make sure read access everywhere for thread-safe prediction.
common::AsConst(base_score_).HostView();
std::as_const(base_score_).HostView();
if (!ctx->IsCPU()) {
common::AsConst(base_score_).View(ctx->gpu_id);
std::as_const(base_score_).View(ctx->gpu_id);
}
CHECK(common::AsConst(base_score_).Data()->HostCanRead());
CHECK(std::as_const(base_score_).Data()->HostCanRead());
}
linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device) const {
@@ -287,9 +287,9 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
base_score_.Data()->Copy(*that.base_score_.Data());
common::AsConst(base_score_).HostView();
std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) {
common::AsConst(base_score_).View(that.base_score_.DeviceIdx());
std::as_const(base_score_).View(that.base_score_.DeviceIdx());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
CHECK(base_score_.Data()->HostCanRead());
@@ -328,9 +328,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
using LearnerAPIThreadLocalStore =
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
using ThreadLocalPredictionCache =
dmlc::ThreadLocalStore<std::map<Learner const *, PredictionContainer>>;
namespace {
StringView ModelMsg() {
return StringView{
@@ -368,6 +365,8 @@ class LearnerConfiguration : public Learner {
LearnerModelParam learner_model_param_;
LearnerTrainParam tparam_;
// Initial prediction.
PredictionContainer prediction_container_;
std::vector<std::string> metric_names_;
void ConfigureModelParamWithoutBaseScore() {
@@ -426,22 +425,15 @@ class LearnerConfiguration : public Learner {
}
public:
explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix> > cache)
explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix>> cache)
: need_configuration_{true} {
monitor_.Init("Learner");
auto& local_cache = (*ThreadLocalPredictionCache::Get())[this];
for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) {
local_cache.Cache(d, Context::kCpuId);
prediction_container_.Cache(d, Context::kCpuId);
}
}
}
~LearnerConfiguration() override {
auto local_cache = ThreadLocalPredictionCache::Get();
if (local_cache->find(this) != local_cache->cend()) {
local_cache->erase(this);
}
}
// Configuration before data is known.
void Configure() override {
@@ -499,10 +491,6 @@ class LearnerConfiguration : public Learner {
CHECK_NE(learner_model_param_.BaseScore(this->Ctx()).Size(), 0) << ModelNotFitted();
}
virtual PredictionContainer* GetPredictionCache() const {
return &((*ThreadLocalPredictionCache::Get())[this]);
}
void LoadConfig(Json const& in) override {
// If configuration is loaded, ensure that the model came from the same version
CHECK(IsA<Object>(in));
@@ -741,11 +729,10 @@ class LearnerConfiguration : public Learner {
if (mparam_.num_feature == 0) {
// TODO(hcho3): Change num_feature to 64-bit integer
unsigned num_feature = 0;
auto local_cache = this->GetPredictionCache();
for (auto& matrix : local_cache->Container()) {
CHECK(matrix.first);
for (auto const& matrix : prediction_container_.Container()) {
CHECK(matrix.first.ptr);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first->Info().num_col_;
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<unsigned>::max() << " features or greater";
@@ -817,13 +804,13 @@ class LearnerConfiguration : public Learner {
*/
void ConfigureTargets() {
CHECK(this->obj_);
auto const& cache = this->GetPredictionCache()->Container();
auto const& cache = prediction_container_.Container();
size_t n_targets = 1;
for (auto const& d : cache) {
if (n_targets == 1) {
n_targets = this->obj_->Targets(d.first->Info());
n_targets = this->obj_->Targets(d.first.ptr->Info());
} else {
auto t = this->obj_->Targets(d.first->Info());
auto t = this->obj_->Targets(d.first.ptr->Info());
CHECK(n_targets == t || 1 == t) << "Inconsistent labels.";
}
}
@@ -1275,8 +1262,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto local_cache = this->GetPredictionCache();
auto& predt = local_cache->Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0);
@@ -1303,8 +1289,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true);
auto local_cache = this->GetPredictionCache();
auto& predt = local_cache->Cache(train, ctx_.gpu_id);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
}
@@ -1326,10 +1311,9 @@ class LearnerImpl : public LearnerIO {
metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
}
auto local_cache = this->GetPredictionCache();
for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = local_cache->Cache(m, ctx_.gpu_id);
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0);
@@ -1370,8 +1354,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else {
auto local_cache = this->GetPredictionCache();
auto& prediction = local_cache->Cache(data, ctx_.gpu_id);
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.gpu_id);

View File

@@ -14,9 +14,11 @@
#include <utility>
#include <vector>
#include "../common/algorithm.h" // ArgSort
#include "../common/math.h"
#include "../common/optional_weight.h" // OptionalWeights
#include "metric_common.h" // MetricNoCache
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/linalg.h"
#include "xgboost/metric.h"
@@ -77,9 +79,8 @@ BinaryAUC(common::Span<float const> predts, linalg::VectorView<float const> labe
* Machine Learning Models
*/
template <typename BinaryAUC>
double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads,
BinaryAUC &&binary_auc) {
double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
CHECK_NE(n_classes, 0);
auto const labels = info.labels.View(Context::kCpuId);
if (labels.Shape(0) != 0) {
@@ -108,7 +109,7 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
}
double fp;
std::tie(fp, tp(c), auc(c)) =
binary_auc(proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
local_area(c) = fp * tp(c);
});
}
@@ -139,23 +140,26 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
return auc_sum;
}
std::tuple<double, double, double> BinaryROCAUC(common::Span<float const> predts,
std::tuple<double, double, double> BinaryROCAUC(Context const *ctx,
common::Span<float const> predts,
linalg::VectorView<float const> labels,
common::OptionalWeights weights) {
auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
auto const sorted_idx =
common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
return BinaryAUC(predts, labels, weights, sorted_idx, TrapezoidArea);
}
/**
* Calculate AUC for 1 ranking group;
*/
double GroupRankingROC(common::Span<float const> predts,
double GroupRankingROC(Context const* ctx, common::Span<float const> predts,
linalg::VectorView<float const> labels, float w) {
// on ranking, we just count all pairs.
double auc{0};
// argsort doesn't support tensor input yet.
auto raw_labels = labels.Values().subspan(0, labels.Size());
auto const sorted_idx = common::ArgSort<size_t>(raw_labels, std::greater<>{});
auto const sorted_idx = common::ArgSort<size_t>(
ctx, raw_labels.data(), raw_labels.data() + raw_labels.size(), std::greater<>{});
w = common::Sqr(w);
double sum_w = 0.0f;
@@ -185,10 +189,11 @@ double GroupRankingROC(common::Span<float const> predts,
*
* https://doi.org/10.1371/journal.pone.0092209
*/
std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
std::tuple<double, double, double> BinaryPRAUC(Context const *ctx, common::Span<float const> predts,
linalg::VectorView<float const> labels,
common::OptionalWeights weights) {
auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
auto const sorted_idx =
common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
double total_pos{0}, total_neg{0};
for (size_t i = 0; i < labels.Size(); ++i) {
auto w = weights[i];
@@ -211,9 +216,8 @@ std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
* Cast LTR problem to binary classification problem by comparing pairs.
*/
template <bool is_roc>
std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
MetaInfo const &info,
int32_t n_threads) {
std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> const &predts,
MetaInfo const &info, int32_t n_threads) {
CHECK_GE(info.group_ptr_.size(), 2);
uint32_t n_groups = info.group_ptr_.size() - 1;
auto s_predts = common::Span<float const>{predts};
@@ -237,9 +241,9 @@ std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
auc = 0;
} else {
if (is_roc) {
auc = GroupRankingROC(g_predts, g_labels, w);
auc = GroupRankingROC(ctx, g_predts, g_labels, w);
} else {
auc = std::get<2>(BinaryPRAUC(g_predts, g_labels, common::OptionalWeights{w}));
auc = std::get<2>(BinaryPRAUC(ctx, g_predts, g_labels, common::OptionalWeights{w}));
}
if (std::isnan(auc)) {
invalid_groups++;
@@ -344,7 +348,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(auc, valid_groups) =
RankingAUC<true>(predts.ConstHostVector(), info, n_threads);
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
std::tie(auc, valid_groups) =
GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
@@ -358,8 +362,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) {
auc = MultiClassOVR(predts.ConstHostVector(), info, n_classes, n_threads,
BinaryROCAUC);
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
}
@@ -370,9 +373,9 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(fp, tp, auc) =
BinaryROCAUC(predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
ctx_->gpu_id, &this->d_cache_);
@@ -422,7 +425,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) {
std::tie(pr, re, auc) =
BinaryPRAUC(predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
@@ -435,8 +438,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) {
auto n_threads = this->ctx_->Threads();
return MultiClassOVR(predts.ConstHostSpan(), info, n_classes, n_threads,
BinaryPRAUC);
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else {
return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes);
}
@@ -453,7 +455,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
InvalidLabels();
}
std::tie(auc, valid_groups) =
RankingAUC<false>(predts.ConstHostVector(), info, n_threads);
RankingAUC<false>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
std::tie(auc, valid_groups) =
GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_);

View File

@@ -5,7 +5,7 @@
#include <algorithm>
#include <cassert>
#include <cub/cub.cuh>
#include <cub/cub.cuh> // NOLINT
#include <limits>
#include <memory>
#include <tuple>

View File

@@ -451,9 +451,8 @@ class QuantileError : public MetricNoCache {
auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
CHECK_NE(n_targets, 0);
auto y_predt = linalg::MakeTensorView(
ctx->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(),
{static_cast<std::size_t>(info.num_row_), alpha_.Size(), n_targets}, ctx->gpu_id);
auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
alpha_.Size(), n_targets);
info.weights_.SetDevice(ctx->gpu_id);
common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()

View File

@@ -6,6 +6,7 @@
#define XGBOOST_METRIC_METRIC_COMMON_H_
#include <limits>
#include <memory> // shared_ptr
#include <string>
#include "../common/common.h"

View File

@@ -27,6 +27,7 @@
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/algorithm.h" // Sort
#include "../common/math.h"
#include "../common/ranking_utils.h" // MakeMetricName
#include "../common/threading_utils.h"
@@ -113,7 +114,7 @@ struct EvalAMS : public MetricNoCache {
const auto &h_preds = preds.ConstHostVector();
common::ParallelFor(ndata, ctx_->Threads(),
[&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); });
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst);
auto ntop = static_cast<unsigned>(ratio_ * ndata);
if (ntop == 0) ntop = ndata;
const double br = 10.0;
@@ -330,7 +331,7 @@ struct EvalCox : public MetricNoCache {
using namespace std; // NOLINT(*)
const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
const auto &label_order = info.LabelAbsSort();
const auto &label_order = info.LabelAbsSort(ctx_);
// pre-compute a sum for the denominator
double exp_p_sum = 0; // we use double because we might need the precision with large datasets

View File

@@ -3,27 +3,34 @@
*/
#include "adaptive.h"
#include <limits>
#include <vector>
#include <algorithm> // std::transform,std::find_if,std::copy,std::unique
#include <cmath> // std::isnan
#include <cstddef> // std::size_t
#include <iterator> // std::distance
#include <vector> // std::vector
#include "../common/common.h"
#include "../common/numeric.h"
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/algorithm.h" // ArgSort
#include "../common/common.h" // AssertGPUSupport
#include "../common/numeric.h" // RunLengthEncode
#include "../common/stats.h" // Quantile,WeightedQuantile
#include "../common/threading_utils.h" // ParallelFor
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/linalg.h"
#include "xgboost/tree_model.h"
#include "xgboost/base.h" // bst_node_t
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // MakeTensorView
#include "xgboost/span.h" // Span
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
namespace detail {
void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& position,
std::vector<size_t>* p_nptr, std::vector<bst_node_t>* p_nidx,
std::vector<size_t>* p_ridx) {
namespace xgboost::obj::detail {
void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree,
std::vector<bst_node_t> const& position, std::vector<size_t>* p_nptr,
std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_ridx) {
auto& nptr = *p_nptr;
auto& nidx = *p_nidx;
auto& ridx = *p_ridx;
ridx = common::ArgSort<size_t>(position);
ridx = common::ArgSort<size_t>(ctx, position.cbegin(), position.cend());
std::vector<bst_node_t> sorted_pos(position);
// permutation
for (size_t i = 0; i < position.size(); ++i) {
@@ -67,18 +74,18 @@ void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& posi
}
void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
auto& tree = *p_tree;
std::vector<bst_node_t> nidx;
std::vector<size_t> nptr;
std::vector<size_t> ridx;
EncodeTreeLeafHost(*p_tree, position, &nptr, &nidx, &ridx);
EncodeTreeLeafHost(ctx, *p_tree, position, &nptr, &nidx, &ridx);
size_t n_leaf = nidx.size();
if (nptr.empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx, p_tree);
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
return;
}
@@ -89,8 +96,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
auto const& h_node_idx = nidx;
auto const& h_node_ptr = nptr;
CHECK_LE(h_node_ptr.back(), info.num_row_);
auto h_predt = linalg::MakeTensorView(predt.ConstHostSpan(),
{info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
predt.Size() / info.num_row_);
// loop over each leaf
common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
@@ -99,8 +106,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
CHECK_LT(k + 1, h_node_ptr.size());
size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
CHECK_LE(group_idx, info.labels.Shape(1));
auto h_labels = info.labels.HostView().Slice(linalg::All(), group_idx);
auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
auto h_weights = linalg::MakeVec(&info.weights_);
auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
@@ -114,9 +121,9 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
float q{0};
if (info.weights_.Empty()) {
q = common::Quantile(alpha, iter, iter + h_row_set.size());
q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
} else {
q = common::WeightedQuantile(alpha, iter, iter + h_row_set.size(), w_it);
q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
}
if (std::isnan(q)) {
CHECK(h_row_set.empty());
@@ -124,8 +131,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
quantiles.at(k) = q;
});
UpdateLeafValues(&quantiles, nidx, p_tree);
UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
}
} // namespace detail
} // namespace obj
} // namespace xgboost
#if !defined(XGBOOST_USE_CUDA)
void UpdateTreeLeafDevice(Context const*, common::Span<bst_node_t const>, std::int32_t,
MetaInfo const&, float, HostDeviceVector<float> const&, float, RegTree*) {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::obj::detail

View File

@@ -3,8 +3,8 @@
*/
#include <thrust/sort.h>
#include <cstdint> // std::int32_t
#include <cub/cub.cuh>
#include <cstdint> // std::int32_t
#include <cub/cub.cuh> // NOLINT
#include "../common/cuda_context.cuh" // CUDAContext
#include "../common/device_helpers.cuh"
@@ -20,20 +20,19 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
// copy position to buffer
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
auto cuctx = ctx->CUDACtx();
size_t n_samples = position.size();
dh::XGBDeviceAllocator<char> alloc;
dh::device_vector<bst_node_t> sorted_position(position.size());
dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
position.size_bytes(), cudaMemcpyDeviceToDevice));
position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
p_ridx->resize(position.size());
dh::Iota(dh::ToSpan(*p_ridx));
// sort row index according to node index
thrust::stable_sort_by_key(thrust::cuda::par(alloc), sorted_position.begin(),
thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
sorted_position.begin() + n_samples, p_ridx->begin());
dh::XGBCachingDeviceAllocator<char> caching;
size_t beg_pos =
thrust::find_if(thrust::cuda::par(caching), sorted_position.cbegin(), sorted_position.cend(),
thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(),
[] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) -
sorted_position.cbegin();
if (beg_pos == sorted_position.size()) {
@@ -72,7 +71,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
size_t* h_num_runs = reinterpret_cast<size_t*>(pinned.subspan(0, sizeof(size_t)).data());
dh::CUDAEvent e;
e.Record(dh::DefaultStream());
e.Record(cuctx->Stream());
copy_stream.View().Wait(e);
// flag for whether there's ignored position
bst_node_t* h_first_unique =
@@ -108,7 +107,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
d_node_ptr[0] = beg_pos;
}
});
thrust::inclusive_scan(thrust::cuda::par(caching), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
thrust::inclusive_scan(cuctx->CTP(), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
dh::tbegin(d_node_ptr));
copy_stream.View().Sync();
CHECK_GT(*h_num_runs, 0);
@@ -141,7 +140,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
}
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::device_vector<size_t> ridx;
@@ -152,17 +151,17 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
if (nptr.Empty()) {
std::vector<float> quantiles;
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree);
UpdateLeafValues(&quantiles, nidx.ConstHostVector(), learning_rate, p_tree);
}
HostDeviceVector<float> quantiles;
predt.SetDevice(ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(predt.ConstDeviceSpan(),
{info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_,
predt.Size() / info.num_row_);
CHECK_LT(group_idx, d_predt.Shape(1));
auto t_predt = d_predt.Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), group_idx);
auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));
auto d_row_index = dh::ToSpan(ridx);
auto seg_beg = nptr.DevicePointer();
@@ -187,7 +186,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
w_it + d_weights.size(), &quantiles);
}
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree);
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), learning_rate, p_tree);
}
} // namespace detail
} // namespace obj

View File

@@ -6,13 +6,15 @@
#include <algorithm>
#include <cstdint> // std::int32_t
#include <limits>
#include <vector>
#include <vector> // std::vector
#include "../collective/communicator-inl.h"
#include "../common/common.h"
#include "xgboost/context.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
#include "xgboost/base.h" // bst_node_t
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
namespace obj {
@@ -34,7 +36,7 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
}
inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
RegTree* p_tree) {
float learning_rate, RegTree* p_tree) {
auto& tree = *p_tree;
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
@@ -69,17 +71,39 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
auto nidx = h_node_idx[i];
auto q = quantiles[i];
CHECK(tree[nidx].IsLeaf());
tree[nidx].SetLeaf(q);
tree[nidx].SetLeaf(q * learning_rate);
}
}
inline std::size_t IdxY(MetaInfo const& info, bst_group_t group_idx) {
std::size_t y_idx{0};
if (info.labels.Shape(1) > 1) {
y_idx = group_idx;
}
CHECK_LE(y_idx, info.labels.Shape(1));
return y_idx;
}
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
} // namespace detail
inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
std::int32_t group_idx, MetaInfo const& info, float learning_rate,
HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
if (ctx->IsCPU()) {
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
position.SetDevice(ctx->gpu_id);
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree);
}
}
} // namespace obj
} // namespace xgboost

View File

@@ -0,0 +1,44 @@
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#include "init_estimation.h"
#include <memory> // unique_ptr
#include "../common/stats.h" // Mean
#include "../tree/fit_stump.h" // FitStump
#include "xgboost/base.h" // GradientPair
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/json.h" // Json
#include "xgboost/linalg.h" // Tensor,Vector
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
namespace obj {
void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const {
if (this->Task().task == ObjInfo::kRegression) {
CheckInitInputs(info);
}
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);
std::unique_ptr<ObjFunction> new_obj{
ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
new_obj->LoadConfig(config);
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.
common::Mean(this->ctx_, leaf_weight, base_score);
this->PredTransform(base_score->Data());
}
} // namespace obj
} // namespace xgboost

View File

@@ -0,0 +1,25 @@
/**
* Copyright 2022-2023 by XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
#define XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
#include "xgboost/data.h" // MetaInfo
#include "xgboost/linalg.h" // Tensor
#include "xgboost/objective.h" // ObjFunction
namespace xgboost {
namespace obj {
class FitIntercept : public ObjFunction {
void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override;
};
inline void CheckInitInputs(MetaInfo const& info) {
CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
if (!info.weights_.Empty()) {
CHECK_EQ(info.weights_.Size(), info.num_row_)
<< "Number of weights should be equal to number of data points.";
}
}
} // namespace obj
} // namespace xgboost
#endif // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_

View File

@@ -44,11 +44,13 @@ namespace obj {
// List of files that will be force linked in static links.
#ifdef XGBOOST_USE_CUDA
DMLC_REGISTRY_LINK_TAG(regression_obj_gpu);
DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
#else
DMLC_REGISTRY_LINK_TAG(regression_obj);
DMLC_REGISTRY_LINK_TAG(quantile_obj);
DMLC_REGISTRY_LINK_TAG(hinge_obj);
DMLC_REGISTRY_LINK_TAG(multiclass_obj);
DMLC_REGISTRY_LINK_TAG(rank_obj);

View File

@@ -0,0 +1,18 @@
/**
* Copyright 2023 by XGBoost Contributors
*/
// Dummy file to enable the CUDA conditional compile trick.
#include <dmlc/registry.h>
namespace xgboost {
namespace obj {
DMLC_REGISTRY_FILE_TAG(quantile_obj);
} // namespace obj
} // namespace xgboost
#ifndef XGBOOST_USE_CUDA
#include "quantile_obj.cu"
#endif // !defined(XBGOOST_USE_CUDA)

View File

@@ -0,0 +1,222 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <vector> // std::vector
#include "../common/linalg_op.h" // ElementWiseKernel,cbegin,cend
#include "../common/quantile_loss_utils.h" // QuantileLossParam
#include "../common/stats.h" // Quantile,WeightedQuantile
#include "adaptive.h" // UpdateTreeLeaf
#include "dmlc/parameter.h" // DMLC_DECLARE_PARAMETER
#include "init_estimation.h" // CheckInitInputs
#include "xgboost/base.h" // GradientPair,XGBOOST_DEVICE,bst_target_t
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/json.h" // Json,String,ToJson,FromJson
#include "xgboost/linalg.h" // Tensor,MakeTensorView,MakeVec
#include "xgboost/objective.h" // ObjFunction
#include "xgboost/parameter.h" // XGBoostParameter
#if defined(XGBOOST_USE_CUDA)
#include "../common/linalg_op.cuh" // ElementWiseKernel
#include "../common/stats.cuh" // SegmentedQuantile
#endif // defined(XGBOOST_USE_CUDA)
namespace xgboost {
namespace obj {
class QuantileRegression : public ObjFunction {
common::QuantileLossParam param_;
HostDeviceVector<float> alpha_;
bst_target_t Targets(MetaInfo const& info) const override {
auto const& alpha = param_.quantile_alpha.Get();
CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target is not yet supported by the quantile loss.";
CHECK(!alpha.empty());
// We have some placeholders for multi-target in the quantile loss. But it's not
// supported as the gbtree doesn't know how to slice the gradient and there's no 3-dim
// model shape in general.
auto n_y = std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
return alpha_.Size() * n_y;
}
public:
void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info, std::int32_t iter,
HostDeviceVector<GradientPair>* out_gpair) override {
if (iter == 0) {
CheckInitInputs(info);
}
CHECK_EQ(param_.quantile_alpha.Get().size(), alpha_.Size());
using SizeT = decltype(info.num_row_);
SizeT n_targets = this->Targets(info);
SizeT n_alphas = alpha_.Size();
CHECK_NE(n_alphas, 0);
CHECK_GE(n_targets, n_alphas);
CHECK_EQ(preds.Size(), info.num_row_ * n_targets);
auto labels = info.labels.View(ctx_->gpu_id);
out_gpair->SetDevice(ctx_->gpu_id);
out_gpair->Resize(n_targets * info.num_row_);
auto gpair =
linalg::MakeTensorView(ctx_, out_gpair, info.num_row_, n_alphas, n_targets / n_alphas);
info.weights_.SetDevice(ctx_->gpu_id);
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};
preds.SetDevice(ctx_->gpu_id);
auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_;
alpha_.SetDevice(ctx_->gpu_id);
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
linalg::ElementWiseKernel(
ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
auto [sample_id, quantile_id, target_id] =
linalg::UnravelIndex(i, n_samples, alpha.size(), n_targets / alpha.size());
auto d = predt(i) - labels(sample_id, target_id);
auto h = weight[sample_id];
if (d >= 0) {
auto g = (1.0f - alpha[quantile_id]) * weight[sample_id];
gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
} else {
auto g = (-alpha[quantile_id] * weight[sample_id]);
gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
}
});
}
void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override {
CHECK(!alpha_.Empty());
auto n_targets = this->Targets(info);
base_score->SetDevice(ctx_->gpu_id);
base_score->Reshape(n_targets);
double sw{0};
if (ctx_->IsCPU()) {
auto quantiles = base_score->HostView();
auto h_weights = info.weights_.ConstHostVector();
if (info.weights_.Empty()) {
sw = info.num_row_;
} else {
sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
}
for (bst_target_t t{0}; t < n_targets; ++t) {
auto alpha = param_.quantile_alpha[t];
auto h_labels = info.labels.HostView();
if (h_weights.empty()) {
quantiles(t) =
common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
} else {
CHECK_EQ(h_weights.size(), h_labels.Size());
quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
linalg::cend(h_labels), std::cbegin(h_weights));
}
}
} else {
#if defined(XGBOOST_USE_CUDA)
alpha_.SetDevice(ctx_->gpu_id);
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->gpu_id);
auto seg_it = dh::MakeTransformIterator<std::size_t>(
thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
CHECK_EQ(d_labels.Shape(1), 1);
auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_labels(sample_idx, 0);
});
auto n = d_labels.Size() * d_alpha.size();
CHECK_EQ(base_score->Size(), d_alpha.size());
if (info.weights_.Empty()) {
common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
info.weights_.SetDevice(ctx_->gpu_id);
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
auto sample_idx = i % d_labels.Shape(0);
return d_weights[sample_idx];
});
common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
val_it, val_it + n, weight_it, weight_it + n,
base_score->Data());
sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
thrust::plus<double>{});
}
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
// For multiple quantiles, we should extend the base score to a vector instead of
// computing the average. For now, this is a workaround.
linalg::Vector<float> temp;
common::Mean(ctx_, *base_score, &temp);
double meanq = temp(0) * sw;
collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
collective::Allreduce<collective::Operation::kSum>(&sw, 1);
meanq /= (sw + kRtEps);
base_score->Reshape(1);
base_score->Data()->Fill(meanq);
}
void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
float learning_rate, HostDeviceVector<float> const& prediction,
std::int32_t group_idx, RegTree* p_tree) const override {
auto alpha = param_.quantile_alpha[group_idx];
::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction,
alpha, p_tree);
}
void Configure(Args const& args) override {
param_.UpdateAllowUnknown(args);
param_.Validate();
this->alpha_.HostVector() = param_.quantile_alpha.Get();
}
ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; }
static char const* Name() { return "reg:quantileerror"; }
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String(Name());
out["quantile_loss_param"] = ToJson(param_);
}
void LoadConfig(Json const& in) override {
CHECK_EQ(get<String const>(in["name"]), Name());
FromJson(in["quantile_loss_param"], &param_);
alpha_.HostVector() = param_.quantile_alpha.Get();
}
const char* DefaultEvalMetric() const override { return "quantile"; }
Json DefaultMetricConfig() const override {
CHECK(param_.GetInitialised());
Json config{Object{}};
config["name"] = String{this->DefaultEvalMetric()};
config["quantile_loss_param"] = ToJson(param_);
return config;
}
};
XGBOOST_REGISTER_OBJECTIVE(QuantileRegression, QuantileRegression::Name())
.describe("Regression with quantile loss.")
.set_body([]() { return new QuantileRegression(); });
#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(quantile_obj_gpu);
#endif // defined(XGBOOST_USE_CUDA)
} // namespace obj
} // namespace xgboost

View File

@@ -1,15 +1,16 @@
/*!
* Copyright 2017-2022 XGBoost contributors
/**
* Copyright 2017-2023 by XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
#include <dmlc/omp.h>
#include <xgboost/logging.h>
#include <cmath>
#include "../common/math.h"
#include "xgboost/data.h" // MetaInfo
#include "xgboost/logging.h"
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
@@ -105,7 +106,6 @@ struct LogisticRaw : public LogisticRegression {
static ObjInfo Info() { return ObjInfo::kRegression; }
};
} // namespace obj
} // namespace xgboost

View File

@@ -20,12 +20,12 @@
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "../common/transform.h"
#include "../tree/fit_stump.h" // FitStump
#include "./regression_loss.h"
#include "adaptive.h"
#include "init_estimation.h" // FitIntercept
#include "xgboost/base.h"
#include "xgboost/context.h"
#include "xgboost/data.h" // MetaInfo
#include "xgboost/context.h" // Context
#include "xgboost/data.h" // MetaInfo
#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/linalg.h"
@@ -43,45 +43,12 @@
namespace xgboost {
namespace obj {
namespace {
void CheckInitInputs(MetaInfo const& info) {
CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
if (!info.weights_.Empty()) {
CHECK_EQ(info.weights_.Size(), info.num_row_)
<< "Number of weights should be equal to number of data points.";
}
}
void CheckRegInputs(MetaInfo const& info, HostDeviceVector<bst_float> const& preds) {
CheckInitInputs(info);
CHECK_EQ(info.labels.Size(), preds.Size()) << "Invalid shape of labels.";
}
} // anonymous namespace
class RegInitEstimation : public ObjFunction {
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) const override {
CheckInitInputs(info);
// Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
Json config{Object{}};
this->SaveConfig(&config);
std::unique_ptr<ObjFunction> new_obj{
ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
new_obj->LoadConfig(config);
new_obj->GetGradient(dummy_predt, info, 0, &gpair);
bst_target_t n_targets = this->Targets(info);
linalg::Vector<float> leaf_weight;
tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
// workaround, we don't support multi-target due to binary model serialization for
// base margin.
common::Mean(this->ctx_, leaf_weight, base_score);
this->PredTransform(base_score->Data());
}
};
#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(regression_obj_gpu);
#endif // defined(XGBOOST_USE_CUDA)
@@ -96,7 +63,7 @@ struct RegLossParam : public XGBoostParameter<RegLossParam> {
};
template<typename Loss>
class RegLossObj : public RegInitEstimation {
class RegLossObj : public FitIntercept {
protected:
HostDeviceVector<float> additional_input_;
@@ -243,7 +210,7 @@ XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
return new RegLossObj<LinearSquareLoss>(); });
// End deprecated
class PseudoHuberRegression : public RegInitEstimation {
class PseudoHuberRegression : public FitIntercept {
PesudoHuberParam param_;
public:
@@ -318,7 +285,7 @@ struct PoissonRegressionParam : public XGBoostParameter<PoissonRegressionParam>
};
// poisson regression for count
class PoissonRegression : public RegInitEstimation {
class PoissonRegression : public FitIntercept {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -413,7 +380,7 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
// cox regression for survival data (negative values mean they are censored)
class CoxRegression : public RegInitEstimation {
class CoxRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -426,7 +393,7 @@ class CoxRegression : public RegInitEstimation {
const auto& preds_h = preds.HostVector();
out_gpair->Resize(preds_h.size());
auto& gpair = out_gpair->HostVector();
const std::vector<size_t> &label_order = info.LabelAbsSort();
const std::vector<size_t> &label_order = info.LabelAbsSort(ctx_);
const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size()); // NOLINT(*)
const bool is_null_weight = info.weights_.Size() == 0;
@@ -510,7 +477,7 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
.set_body([]() { return new CoxRegression(); });
// gamma regression
class GammaRegression : public RegInitEstimation {
class GammaRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -601,7 +568,7 @@ struct TweedieRegressionParam : public XGBoostParameter<TweedieRegressionParam>
};
// tweedie regression
class TweedieRegression : public RegInitEstimation {
class TweedieRegression : public FitIntercept {
public:
// declare functions
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -775,20 +742,10 @@ class MeanAbsoluteError : public ObjFunction {
}
void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
HostDeviceVector<float> const& prediction, std::int32_t group_idx,
RegTree* p_tree) const override {
if (ctx_->IsCPU()) {
auto const& h_position = position.ConstHostVector();
detail::UpdateTreeLeafHost(ctx_, h_position, group_idx, info, prediction, 0.5, p_tree);
} else {
#if defined(XGBOOST_USE_CUDA)
position.SetDevice(ctx_->gpu_id);
auto d_position = position.ConstDeviceSpan();
detail::UpdateTreeLeafDevice(ctx_, d_position, group_idx, info, prediction, 0.5, p_tree);
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
float learning_rate, HostDeviceVector<float> const& prediction,
std::int32_t group_idx, RegTree* p_tree) const override {
::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5,
p_tree);
}
const char* DefaultEvalMetric() const override { return "mae"; }

View File

@@ -164,7 +164,7 @@ struct GHistIndexMatrixView {
SparsePage::Inst operator[](size_t r) {
auto t = omp_get_thread_num();
auto const beg = (n_features_ * kUnroll * t) + (current_unroll_[t] * n_features_);
size_t non_missing{beg};
size_t non_missing{static_cast<std::size_t>(beg)};
for (bst_feature_t c = 0; c < n_features_; ++c) {
float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
@@ -477,7 +477,8 @@ class ColumnSplitHelper {
// auto block_id has the same type as `n_blocks`.
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
auto const batch_offset = block_id * block_of_rows_size;
auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
static_cast<std::size_t>(block_of_rows_size));
auto const fvec_offset = omp_get_thread_num() * block_of_rows_size;
FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
@@ -490,7 +491,8 @@ class ColumnSplitHelper {
// auto block_id has the same type as `n_blocks`.
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
auto const batch_offset = block_id * block_of_rows_size;
auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
static_cast<std::size_t>(block_of_rows_size));
PredictAllTrees(out_preds, batch_offset, batch_offset + batch.base_rowid, num_group,
block_size);
});
@@ -584,7 +586,7 @@ class CPUPredictor : public Predictor {
void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
if (p_fmat->Info().data_split_mode == DataSplitMode::kCol) {
if (p_fmat->IsColumnSplit()) {
ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end);
helper.PredictDMatrix(p_fmat, out_preds);
return;

View File

@@ -3,10 +3,11 @@
*/
#include "cpu_treeshap.h"
#include <cinttypes> // std::uint32_t
#include <algorithm> // copy
#include <cinttypes> // std::uint32_t
#include "predict_fn.h" // GetNextNode
#include "xgboost/base.h" // bst_node_t
#include "predict_fn.h" // GetNextNode
#include "xgboost/base.h" // bst_node_t
#include "xgboost/logging.h"
#include "xgboost/tree_model.h" // RegTree

View File

@@ -1,6 +1,10 @@
#ifndef XGBOOST_PREDICTOR_CPU_TREESHAP_H_
#define XGBOOST_PREDICTOR_CPU_TREESHAP_H_
/**
* Copyright by XGBoost Contributors 2017-2022
*/
#include <vector> // vector
#include "xgboost/tree_model.h" // RegTree
namespace xgboost {
@@ -15,3 +19,4 @@ void CalculateContributions(RegTree const &tree, const RegTree::FVec &feat,
std::vector<float> *mean_values, bst_float *out_contribs, int condition,
unsigned condition_feature);
} // namespace xgboost
#endif // XGBOOST_PREDICTOR_CPU_TREESHAP_H_

View File

@@ -9,6 +9,7 @@
#include <limits> // std::numeric_limits
#include <vector>
#include "../collective/communicator-inl.h"
#include "../common/numeric.h" // Iota
#include "../common/partition_builder.h"
#include "hist/expand_entry.h" // CPUExpandEntry
@@ -16,17 +17,73 @@
namespace xgboost {
namespace tree {
class CommonRowPartitioner {
static constexpr size_t kPartitionBlockSize = 2048;
common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
common::RowSetCollection row_set_collection_;
static constexpr size_t kPartitionBlockSize = 2048;
class ColumnSplitHelper {
public:
ColumnSplitHelper() = default;
ColumnSplitHelper(bst_row_t num_row,
common::PartitionBuilder<kPartitionBlockSize>* partition_builder,
common::RowSetCollection* row_set_collection)
: partition_builder_{partition_builder}, row_set_collection_{row_set_collection} {
decision_storage_.resize(num_row);
decision_bits_ = BitVector(common::Span<BitVector::value_type>(decision_storage_));
missing_storage_.resize(num_row);
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
}
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values in the local worker, so
// we first collect all the decisions and whether the feature is missing into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
const int32_t nid = nodes[node_in_set].nid;
partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, &decision_bits_,
&missing_bits_);
});
// Then aggregate the bit vectors across all the workers.
collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
decision_storage_.size());
collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
missing_storage_.size());
// Finally use the bit vectors to partition the rows.
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin);
partition_builder_->AllocateForTask(task_id);
partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, decision_bits_,
missing_bits_);
});
}
private:
using BitVector = RBitField8;
std::vector<BitVector::value_type> decision_storage_{};
BitVector decision_bits_{};
std::vector<BitVector::value_type> missing_storage_{};
BitVector missing_bits_{};
common::PartitionBuilder<kPartitionBlockSize>* partition_builder_;
common::RowSetCollection* row_set_collection_;
};
class CommonRowPartitioner {
public:
bst_row_t base_rowid = 0;
CommonRowPartitioner() = default;
CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid)
: base_rowid{_base_rowid} {
CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid,
bool is_col_split)
: base_rowid{_base_rowid}, is_col_split_{is_col_split} {
row_set_collection_.Clear();
std::vector<size_t>& row_indices = *row_set_collection_.Data();
row_indices.resize(num_row);
@@ -34,6 +91,10 @@ class CommonRowPartitioner {
std::size_t* p_row_indices = row_indices.data();
common::Iota(ctx, p_row_indices, p_row_indices + row_indices.size(), base_rowid);
row_set_collection_.Init();
if (is_col_split_) {
column_split_helper_ = ColumnSplitHelper{num_row, &partition_builder_, &row_set_collection_};
}
}
void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
@@ -156,16 +217,20 @@ class CommonRowPartitioner {
// 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
// Store results in intermediate buffers from partition_builder_
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
partition_builder_.AllocateForTask(task_id);
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
partition_builder_.template Partition<BinIdxType, any_missing, any_cat>(
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
row_set_collection_[nid].begin);
});
if (is_col_split_) {
column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
} else {
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
const int32_t nid = nodes[node_in_set].nid;
const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
partition_builder_.AllocateForTask(task_id);
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
partition_builder_.template Partition<BinIdxType, any_missing, any_cat>(
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
row_set_collection_[nid].begin);
});
}
// 3. Compute offsets to copy blocks of row-indexes
// from partition_builder_ to row_set_collection_
@@ -205,6 +270,12 @@ class CommonRowPartitioner {
ctx, tree, this->Partitions(), p_out_position,
[&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
}
private:
common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
common::RowSetCollection row_set_collection_;
bool is_col_split_;
ColumnSplitHelper column_split_helper_;
};
} // namespace tree

View File

@@ -97,7 +97,7 @@ class EvaluateSplitAgent {
idx += kBlockSize) {
local_sum += LoadGpair(node_histogram + idx);
}
local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);
local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
// Broadcast result from thread 0
return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
__shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
@@ -359,8 +359,8 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
// One block for each feature
uint32_t constexpr kBlockThreads = 32;
dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
0}(
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads,
0}(
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
shared_inputs,
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),

View File

@@ -1,15 +1,15 @@
/*!
* Copyright 2020-2021 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <algorithm>
#include <ctgmath>
#include <cstdint> // uint32_t
#include <limits>
#include "../../common/device_helpers.cuh"
#include "../../common/deterministic.cuh"
#include "../../common/device_helpers.cuh"
#include "../../data/ellpack_page.cuh"
#include "histogram.cuh"
#include "row_partitioner.cuh"
@@ -83,7 +83,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
*/
to_floating_point_ =
histogram_rounding /
T(IntT(1) << (sizeof(typename GradientSumT::ValueT) * 8 - 2)); // keep 1 for sign bit
static_cast<T>(static_cast<IntT>(1)
<< (sizeof(typename GradientSumT::ValueT) * 8 - 2)); // keep 1 for sign bit
/**
* Factor for converting gradients from floating-point to fixed-point. For
* f64:
@@ -93,8 +94,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
* rounding is calcuated as exp(m), see the rounding factor calcuation for
* details.
*/
to_fixed_point_ =
GradientSumT(T(1) / to_floating_point_.GetGrad(), T(1) / to_floating_point_.GetHess());
to_fixed_point_ = GradientSumT(static_cast<T>(1) / to_floating_point_.GetGrad(),
static_cast<T>(1) / to_floating_point_.GetHess());
}
@@ -153,7 +154,8 @@ class HistogramAgent {
d_gpair_(d_gpair) {}
__device__ void ProcessPartialTileShared(std::size_t offset) {
for (std::size_t idx = offset + threadIdx.x;
idx < min(offset + kBlockThreads * kItemsPerTile, n_elements_); idx += kBlockThreads) {
idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_);
idx += kBlockThreads) {
int ridx = d_ridx_[idx / feature_stride_];
int gidx =
matrix_
@@ -295,11 +297,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// Allocate number of blocks such that each block has about kMinItemsPerBlock work
// Up to a maximum where the device is saturated
grid_size =
min(grid_size,
unsigned(common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
grid_size = std::min(grid_size, static_cast<std::uint32_t>(
common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
dh::LaunchKernel{dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
dh::LaunchKernel {dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),
gpair.data(), rounding);
};

View File

@@ -130,7 +130,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
std::size_t item_idx;
AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res};
return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
});
size_t temp_bytes = 0;
if (tmp->empty()) {

View File

@@ -1,10 +1,11 @@
/*!
* Copyright 2021-2022 by XGBoost Contributors
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <numeric>
@@ -16,13 +17,11 @@
#include "../../common/random.h"
#include "../../data/gradient_index.h"
#include "../constraints.h"
#include "../param.h"
#include "../param.h" // for TrainParam
#include "../split_evaluator.h"
#include "xgboost/context.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
template <typename ExpandEntry>
class HistEvaluator {
private:
@@ -34,10 +33,11 @@ class HistEvaluator {
};
private:
TrainParam param_;
Context const* ctx_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
TreeEvaluator tree_evaluator_;
int32_t n_threads_ {0};
bool is_col_split_{false};
FeatureInteractionConstraintHost interaction_constraints_;
std::vector<NodeEntry> snode_;
@@ -53,8 +53,9 @@ class HistEvaluator {
}
}
bool IsValid(GradStats const &left, GradStats const &right) const {
return left.GetHess() >= param_.min_child_weight && right.GetHess() >= param_.min_child_weight;
[[nodiscard]] bool IsValid(GradStats const &left, GradStats const &right) const {
return left.GetHess() >= param_->min_child_weight &&
right.GetHess() >= param_->min_child_weight;
}
/**
@@ -93,9 +94,10 @@ class HistEvaluator {
right_sum = GradStats{hist[i]};
left_sum.SetSubstract(parent.stats, right_sum);
if (IsValid(left_sum, right_sum)) {
auto missing_left_chg = static_cast<float>(
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain);
auto missing_left_chg =
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain);
best.Update(missing_left_chg, fidx, split_pt, true, true, left_sum, right_sum);
}
@@ -103,9 +105,10 @@ class HistEvaluator {
right_sum.Add(missing);
left_sum.SetSubstract(parent.stats, right_sum);
if (IsValid(left_sum, right_sum)) {
auto missing_right_chg = static_cast<float>(
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain);
auto missing_right_chg =
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain);
best.Update(missing_right_chg, fidx, split_pt, false, true, left_sum, right_sum);
}
}
@@ -150,7 +153,7 @@ class HistEvaluator {
bst_bin_t f_begin = cut_ptr[fidx];
bst_bin_t f_end = cut_ptr[fidx + 1];
bst_bin_t n_bins_feature{f_end - f_begin};
auto n_bins = std::min(param_.max_cat_threshold, n_bins_feature);
auto n_bins = std::min(param_->max_cat_threshold, n_bins_feature);
// statistics on both sides of split
GradStats left_sum;
@@ -179,9 +182,9 @@ class HistEvaluator {
right_sum.SetSubstract(parent.stats, left_sum); // missing on right
}
if (IsValid(left_sum, right_sum)) {
auto loss_chg =
evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
parent.root_gain;
auto loss_chg = evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain;
// We don't have a numeric split point, nan here is a dummy split.
if (best.Update(loss_chg, fidx, std::numeric_limits<float>::quiet_NaN(), d_step == 1, true,
left_sum, right_sum)) {
@@ -254,7 +257,7 @@ class HistEvaluator {
if (d_step > 0) {
// forward enumeration: split at right bound of each bin
loss_chg =
static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum},
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
GradStats{right_sum}) -
parent.root_gain);
split_pt = cut_val[i]; // not used for partition based
@@ -262,7 +265,7 @@ class HistEvaluator {
} else {
// backward enumeration: split at left bound of each bin
loss_chg =
static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{right_sum},
static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{right_sum},
GradStats{left_sum}) -
parent.root_gain);
if (i == imin) {
@@ -283,6 +286,7 @@ class HistEvaluator {
void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
common::Span<FeatureType const> feature_types, const RegTree &tree,
std::vector<ExpandEntry> *p_entries) {
auto n_threads = ctx_->Threads();
auto& entries = *p_entries;
// All nodes are on the same level, so we can store the shared ptr.
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
@@ -294,23 +298,23 @@ class HistEvaluator {
}
CHECK(!features.empty());
const size_t grain_size =
std::max<size_t>(1, features.front()->Size() / n_threads_);
std::max<size_t>(1, features.front()->Size() / n_threads);
common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
return features[nidx_in_set]->Size();
}, grain_size);
std::vector<ExpandEntry> tloc_candidates(n_threads_ * entries.size());
std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
for (size_t i = 0; i < entries.size(); ++i) {
for (decltype(n_threads_) j = 0; j < n_threads_; ++j) {
tloc_candidates[i * n_threads_ + j] = entries[i];
for (decltype(n_threads) j = 0; j < n_threads; ++j) {
tloc_candidates[i * n_threads + j] = entries[i];
}
}
auto evaluator = tree_evaluator_.GetEvaluator();
auto const& cut_ptrs = cut.Ptrs();
common::ParallelFor2d(space, n_threads_, [&](size_t nidx_in_set, common::Range1d r) {
common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
auto tidx = omp_get_thread_num();
auto entry = &tloc_candidates[n_threads_ * nidx_in_set + tidx];
auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
auto best = &entry->split;
auto nidx = entry->nid;
auto histogram = hist[nidx];
@@ -323,7 +327,7 @@ class HistEvaluator {
}
if (is_cat) {
auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) {
EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best);
} else {
std::vector<size_t> sorted_idx(n_bins);
@@ -331,8 +335,8 @@ class HistEvaluator {
auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins);
// Sort the histogram to get contiguous partitions.
std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) {
auto ret = evaluator.CalcWeightCat(param_, feat_hist[l]) <
evaluator.CalcWeightCat(param_, feat_hist[r]);
auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) <
evaluator.CalcWeightCat(*param_, feat_hist[r]);
return ret;
});
EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
@@ -349,12 +353,29 @@ class HistEvaluator {
for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
++nidx_in_set) {
for (auto tidx = 0; tidx < n_threads_; ++tidx) {
for (auto tidx = 0; tidx < n_threads; ++tidx) {
entries[nidx_in_set].split.Update(
tloc_candidates[n_threads_ * nidx_in_set + tidx].split);
tloc_candidates[n_threads * nidx_in_set + tidx].split);
}
}
if (is_col_split_) {
// With column-wise data split, we gather the best splits from all the workers and update the
// expand entries accordingly.
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
auto const num_entries = entries.size();
std::vector<ExpandEntry> buffer{num_entries * world};
std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
for (auto worker = 0; worker < world; ++worker) {
for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
}
}
}
}
// Add splits to tree, handles all statistic
void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
@@ -362,24 +383,22 @@ class HistEvaluator {
GradStats parent_sum = candidate.split.left_sum;
parent_sum.Add(candidate.split.right_sum);
auto base_weight =
evaluator.CalcWeight(candidate.nid, param_, GradStats{parent_sum});
auto base_weight = evaluator.CalcWeight(candidate.nid, *param_, GradStats{parent_sum});
auto left_weight =
evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.left_sum});
evaluator.CalcWeight(candidate.nid, *param_, GradStats{candidate.split.left_sum});
auto right_weight =
evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.right_sum});
evaluator.CalcWeight(candidate.nid, *param_, GradStats{candidate.split.right_sum});
if (candidate.split.is_cat) {
tree.ExpandCategorical(
candidate.nid, candidate.split.SplitIndex(), candidate.split.cat_bits,
candidate.split.DefaultLeft(), base_weight, left_weight * param_.learning_rate,
right_weight * param_.learning_rate, candidate.split.loss_chg, parent_sum.GetHess(),
candidate.split.DefaultLeft(), base_weight, left_weight * param_->learning_rate,
right_weight * param_->learning_rate, candidate.split.loss_chg, parent_sum.GetHess(),
candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
} else {
tree.ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
candidate.split.DefaultLeft(), base_weight,
left_weight * param_.learning_rate, right_weight * param_.learning_rate,
left_weight * param_->learning_rate, right_weight * param_->learning_rate,
candidate.split.loss_chg, parent_sum.GetHess(),
candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
}
@@ -395,11 +414,11 @@ class HistEvaluator {
max_node = std::max(candidate.nid, max_node);
snode_.resize(tree.GetNodes().size());
snode_.at(left_child).stats = candidate.split.left_sum;
snode_.at(left_child).root_gain = evaluator.CalcGain(
candidate.nid, param_, GradStats{candidate.split.left_sum});
snode_.at(left_child).root_gain =
evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.left_sum});
snode_.at(right_child).stats = candidate.split.right_sum;
snode_.at(right_child).root_gain = evaluator.CalcGain(
candidate.nid, param_, GradStats{candidate.split.right_sum});
snode_.at(right_child).root_gain =
evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum});
interaction_constraints_.Split(candidate.nid,
tree[candidate.nid].SplitIndex(), left_child,
@@ -409,30 +428,31 @@ class HistEvaluator {
auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
auto const& Stats() const { return snode_; }
float InitRoot(GradStats const& root_sum) {
float InitRoot(GradStats const &root_sum) {
snode_.resize(1);
auto root_evaluator = tree_evaluator_.GetEvaluator();
snode_[0].stats = GradStats{root_sum.GetGrad(), root_sum.GetHess()};
snode_[0].root_gain = root_evaluator.CalcGain(RegTree::kRoot, param_,
GradStats{snode_[0].stats});
auto weight = root_evaluator.CalcWeight(RegTree::kRoot, param_,
GradStats{snode_[0].stats});
snode_[0].root_gain =
root_evaluator.CalcGain(RegTree::kRoot, *param_, GradStats{snode_[0].stats});
auto weight = root_evaluator.CalcWeight(RegTree::kRoot, *param_, GradStats{snode_[0].stats});
return weight;
}
public:
// The column sampler must be constructed by caller since we need to preserve the rng
// for the entire training session.
explicit HistEvaluator(TrainParam const &param, MetaInfo const &info, int32_t n_threads,
explicit HistEvaluator(Context const *ctx, TrainParam const *param, MetaInfo const &info,
std::shared_ptr<common::ColumnSampler> sampler)
: param_{param},
: ctx_{ctx},
param_{param},
column_sampler_{std::move(sampler)},
tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
n_threads_{n_threads} {
interaction_constraints_.Configure(param, info.num_col_);
column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
is_col_split_{info.data_split_mode == DataSplitMode::kCol} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
param_->colsample_bynode, param_->colsample_bylevel,
param_->colsample_bytree);
}
};
@@ -467,6 +487,5 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
});
}
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_

View File

@@ -29,6 +29,7 @@ class HistogramBuilder {
size_t n_batches_{0};
// Whether XGBoost is running in distributed environment.
bool is_distributed_{false};
bool is_col_split_{false};
public:
/**
@@ -40,7 +41,7 @@ class HistogramBuilder {
* of using global rabit variable.
*/
void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
bool is_distributed) {
bool is_distributed, bool is_col_split) {
CHECK_GE(n_threads, 1);
n_threads_ = n_threads;
n_batches_ = n_batches;
@@ -50,6 +51,7 @@ class HistogramBuilder {
buffer_.Init(total_bins);
builder_ = common::GHistBuilder(total_bins);
is_distributed_ = is_distributed;
is_col_split_ = is_col_split;
// Workaround s390x gcc 7.5.0
auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce;
}
@@ -96,7 +98,7 @@ class HistogramBuilder {
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
RegTree const *p_tree) {
if (is_distributed_) {
if (is_distributed_ && !is_col_split_) {
this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick, p_tree);
} else {
@@ -130,7 +132,7 @@ class HistogramBuilder {
return;
}
if (is_distributed_) {
if (is_distributed_ && !is_col_split_) {
this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
nodes_for_subtraction_trick,
starting_index, sync_count);

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2021 by Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file param.h
* \brief training parameters, statistics used to support tree construction.
* \author Tianqi Chen
@@ -238,9 +238,8 @@ XGBOOST_DEVICE inline static T1 ThresholdL1(T1 w, T2 alpha) {
// calculate the cost of loss function
template <typename TrainingParams, typename T>
XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p,
T sum_grad, T sum_hess, T w) {
return -(T(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad, T sum_hess, T w) {
return -(static_cast<T>(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
}
// calculate weight given the statistics
@@ -261,7 +260,7 @@ XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
template <typename TrainingParams, typename T>
XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
if (sum_hess < p.min_child_weight || sum_hess <= 0.0) {
return T(0.0);
return static_cast<T>(0.0);
}
if (p.max_delta_step == 0.0f) {
if (p.reg_alpha == 0.0f) {

View File

@@ -1069,8 +1069,8 @@ bool LoadModelImpl(Json const& in, TreeParam* param, std::vector<RTreeNodeStat>*
split_types = std::remove_reference_t<decltype(split_types)>(n_nodes);
split_categories_segments = std::remove_reference_t<decltype(split_categories_segments)>(n_nodes);
static_assert(std::is_integral<decltype(GetElem<Integer>(lefts, 0))>::value, "");
static_assert(std::is_floating_point<decltype(GetElem<Number>(loss_changes, 0))>::value, "");
static_assert(std::is_integral<decltype(GetElem<Integer>(lefts, 0))>::value);
static_assert(std::is_floating_point<decltype(GetElem<Number>(loss_changes, 0))>::value);
CHECK_EQ(n_nodes, split_categories_segments.size());
// Set node

View File

@@ -23,8 +23,7 @@
#include "xgboost/tree_model.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_approx);
@@ -41,7 +40,7 @@ auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
class GloablApproxBuilder {
protected:
TrainParam param_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> col_sampler_;
HistEvaluator<CPUExpandEntry> evaluator_;
HistogramBuilder<CPUExpandEntry> histogram_builder_;
@@ -64,19 +63,19 @@ class GloablApproxBuilder {
bst_bin_t n_total_bins = 0;
partitioner_.clear();
// Generating the GHistIndexMatrix is quite slow, is there a way to speed it up?
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess, task_))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess, task_))) {
if (n_total_bins == 0) {
n_total_bins = page.cut.TotalBins();
feature_values_ = page.cut;
} else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid);
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
n_batches_++;
}
histogram_builder_.Reset(n_total_bins, BatchSpec(param_, hess), ctx_->Threads(), n_batches_,
collective::IsDistributed());
histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
collective::IsDistributed(), p_fmat->IsColumnSplit());
monitor_->Stop(__func__);
}
@@ -90,11 +89,13 @@ class GloablApproxBuilder {
for (auto const &g : gpair) {
root_sum.Add(g);
}
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
if (p_fmat->IsRowSplit()) {
collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
}
std::vector<CPUExpandEntry> nodes{best};
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
{}, gpair);
i++;
@@ -103,7 +104,7 @@ class GloablApproxBuilder {
auto weight = evaluator_.InitRoot(root_sum);
p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
auto const &histograms = histogram_builder_.Histogram();
auto ft = p_fmat->Info().feature_types.ConstHostSpan();
@@ -145,7 +146,7 @@ class GloablApproxBuilder {
size_t i = 0;
auto space = ConstructHistSpace(partitioner_, nodes_to_build);
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
nodes_to_build, nodes_to_sub, gpair);
i++;
@@ -166,12 +167,12 @@ class GloablApproxBuilder {
}
public:
explicit GloablApproxBuilder(TrainParam param, MetaInfo const &info, Context const *ctx,
explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx,
std::shared_ptr<common::ColumnSampler> column_sampler, ObjInfo task,
common::Monitor *monitor)
: param_{std::move(param)},
: param_{param},
col_sampler_{std::move(column_sampler)},
evaluator_{param_, info, ctx->Threads(), col_sampler_},
evaluator_{ctx, param_, info, col_sampler_},
ctx_{ctx},
task_{task},
monitor_{monitor} {}
@@ -181,7 +182,7 @@ class GloablApproxBuilder {
p_last_tree_ = p_tree;
this->InitData(p_fmat, hess);
Driver<CPUExpandEntry> driver(param_);
Driver<CPUExpandEntry> driver(*param_);
auto &tree = *p_tree;
driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)});
auto expand_set = driver.Pop();
@@ -211,7 +212,7 @@ class GloablApproxBuilder {
monitor_->Start("UpdatePosition");
size_t page_id = 0;
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
page_id++;
}
@@ -248,7 +249,6 @@ class GloablApproxBuilder {
* iteration.
*/
class GlobalApproxUpdater : public TreeUpdater {
TrainParam param_;
common::Monitor monitor_;
// specializations for different histogram precision.
std::unique_ptr<GloablApproxBuilder> pimpl_;
@@ -263,15 +263,9 @@ class GlobalApproxUpdater : public TreeUpdater {
monitor_.Init(__func__);
}
void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); }
void LoadConfig(Json const &in) override {
auto const &config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json *p_out) const override {
auto &out = *p_out;
out["train_param"] = ToJson(param_);
}
void Configure(Args const &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
void InitData(TrainParam const &param, HostDeviceVector<GradientPair> const *gpair,
linalg::Matrix<GradientPair> *sampled) {
@@ -281,20 +275,17 @@ class GlobalApproxUpdater : public TreeUpdater {
SampleGradient(ctx_, param, sampled->HostView());
}
char const *Name() const override { return "grow_histmaker"; }
[[nodiscard]] char const *Name() const override { return "grow_histmaker"; }
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *m,
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *m,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
pimpl_ = std::make_unique<GloablApproxBuilder>(param_, m->Info(), ctx_, column_sampler_, task_,
pimpl_ = std::make_unique<GloablApproxBuilder>(param, m->Info(), ctx_, column_sampler_, task_,
&monitor_);
linalg::Matrix<GradientPair> h_gpair;
// Obtain the hessian values for weighted sketching
InitData(param_, gpair, &h_gpair);
InitData(*param, gpair, &h_gpair);
std::vector<float> hess(h_gpair.Size());
auto const &s_gpair = h_gpair.Data()->ConstHostVector();
std::transform(s_gpair.begin(), s_gpair.end(), hess.begin(),
@@ -302,12 +293,11 @@ class GlobalApproxUpdater : public TreeUpdater {
cached_ = m;
size_t t_idx = 0;
std::size_t t_idx = 0;
for (auto p_tree : trees) {
this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
++t_idx;
}
param_.learning_rate = lr;
}
bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
@@ -318,7 +308,7 @@ class GlobalApproxUpdater : public TreeUpdater {
return true;
}
bool HasNodePosition() const override { return true; }
[[nodiscard]] bool HasNodePosition() const override { return true; }
};
DMLC_REGISTRY_FILE_TAG(grow_histmaker);
@@ -328,5 +318,4 @@ XGBOOST_REGISTER_TREE_UPDATER(GlobalHistMaker, "grow_histmaker")
"Tree constructor that uses approximate histogram construction "
"for each node.")
.set_body([](Context const *ctx, ObjInfo task) { return new GlobalApproxUpdater(ctx, task); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file updater_colmaker.cc
* \brief use columnwise update to construct a tree
* \author Tianqi Chen
@@ -17,8 +17,7 @@
#include "../common/random.h"
#include "split_evaluator.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_colmaker);
@@ -57,18 +56,15 @@ class ColMaker: public TreeUpdater {
public:
explicit ColMaker(Context const *ctx) : TreeUpdater(ctx) {}
void Configure(const Args &args) override {
param_.UpdateAllowUnknown(args);
colmaker_param_.UpdateAllowUnknown(args);
}
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
FromJson(config.at("colmaker_train_param"), &this->colmaker_param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
void SaveConfig(Json *p_out) const override {
auto &out = *p_out;
out["colmaker_train_param"] = ToJson(colmaker_param_);
}
@@ -95,7 +91,7 @@ class ColMaker: public TreeUpdater {
}
}
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (collective::IsDistributed()) {
@@ -108,22 +104,16 @@ class ColMaker: public TreeUpdater {
}
this->LazyGetColumnDensity(dmat);
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
interaction_constraints_.Configure(param_, dmat->Info().num_row_);
interaction_constraints_.Configure(*param, dmat->Info().num_row_);
// build tree
for (auto tree : trees) {
CHECK(ctx_);
Builder builder(param_, colmaker_param_, interaction_constraints_, ctx_,
column_densities_);
Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_);
builder.Update(gpair->ConstHostVector(), dmat, tree);
}
param_.learning_rate = lr;
}
protected:
// training parameter
TrainParam param_;
ColMakerTrainParam colmaker_param_;
// SplitEvaluator that will be cloned for each Builder
std::vector<float> column_densities_;
@@ -234,9 +224,9 @@ class ColMaker: public TreeUpdater {
}
}
{
column_sampler_.Init(fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
param_.colsample_bynode, param_.colsample_bylevel,
param_.colsample_bytree);
column_sampler_.Init(ctx_, fmat.Info().num_col_,
fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
}
{
// setup temp space for each thread
@@ -614,5 +604,4 @@ class ColMaker: public TreeUpdater {
XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker")
.describe("Grow tree with parallelization over columns.")
.set_body([](Context const *ctx, ObjInfo) { return new ColMaker(ctx); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2022 XGBoost contributors
/**
* Copyright 2017-2023 by XGBoost contributors
*/
#include <thrust/copy.h>
#include <thrust/reduce.h>
@@ -160,11 +160,11 @@ class DeviceHistogramStorage {
if (nidx_map_.find(nidx) != nidx_map_.cend()) {
// Fetch from normal cache
auto ptr = data_.data().get() + nidx_map_.at(nidx);
return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
} else {
// Fetch from overflow
auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
}
}
};
@@ -243,7 +243,7 @@ struct GPUHistMakerDevice {
// thread safe
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
auto const& info = dmat->Info();
this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
@@ -306,6 +306,8 @@ struct GPUHistMakerDevice {
matrix.is_dense
};
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
// Store the feature set ptrs so they don't go out of scope before the kernel is called
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
for (size_t i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
@@ -314,10 +316,12 @@ struct GPUHistMakerDevice {
nidx[i * 2 + 1] = right_nidx;
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
feature_sets.emplace_back(left_sampled_features);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
feature_sets.emplace_back(right_sampled_features);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
right_nidx);
@@ -330,8 +334,8 @@ struct GPUHistMakerDevice {
}
bst_feature_t max_active_features = 0;
for (auto input : h_node_inputs) {
max_active_features = std::max(max_active_features,
bst_feature_t(input.feature_set.size()));
max_active_features =
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
}
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
@@ -752,7 +756,6 @@ class GPUHistMaker : public TreeUpdater {
void Configure(const Args& args) override {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Hist]: Configure";
param_.UpdateAllowUnknown(args);
hist_maker_param_.UpdateAllowUnknown(args);
dh::CheckComputeCapability();
initialised_ = false;
@@ -764,32 +767,26 @@ class GPUHistMaker : public TreeUpdater {
auto const& config = get<Object const>(in);
FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_);
initialised_ = false;
FromJson(config.at("train_param"), &param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["gpu_hist_train_param"] = ToJson(hist_maker_param_);
out["train_param"] = ToJson(param_);
}
~GPUHistMaker() { // NOLINT
dh::GlobalMemoryLogger().Log();
}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override {
monitor_.Start("Update");
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
// build tree
try {
size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(gpair, dmat, tree, &out_position[t_idx]);
this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
if (hist_maker_param_.debug_synchronize) {
this->CheckTreesSynchronized(tree);
@@ -800,12 +797,10 @@ class GPUHistMaker : public TreeUpdater {
} catch (const std::exception& e) {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}
param_.learning_rate = lr;
monitor_.Stop("Update");
}
void InitDataOnce(DMatrix* dmat) {
void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
info_ = &dmat->Info();
@@ -814,24 +809,24 @@ class GPUHistMaker : public TreeUpdater {
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
BatchParam batch_param{
ctx_->gpu_id,
param_.max_bin,
ctx_->gpu_id,
param->max_bin,
};
auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
info_->feature_types.SetDevice(ctx_->gpu_id);
maker.reset(new GPUHistMakerDevice<GradientSumT>(
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_,
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
column_sampling_seed, info_->num_col_, batch_param));
p_last_fmat_ = dmat;
initialised_ = true;
}
void InitData(DMatrix* dmat, RegTree const* p_tree) {
void InitData(TrainParam const* param, DMatrix* dmat, RegTree const* p_tree) {
if (!initialised_) {
monitor_.Start("InitDataOnce");
this->InitDataOnce(dmat);
this->InitDataOnce(param, dmat);
monitor_.Stop("InitDataOnce");
}
p_last_tree_ = p_tree;
@@ -852,10 +847,10 @@ class GPUHistMaker : public TreeUpdater {
CHECK(*local_tree == reference_tree);
}
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
HostDeviceVector<bst_node_t>* p_out_position) {
void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
monitor_.Start("InitData");
this->InitData(p_fmat, p_tree);
this->InitData(param, p_fmat, p_tree);
monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id);
@@ -874,7 +869,6 @@ class GPUHistMaker : public TreeUpdater {
return result;
}
TrainParam param_; // NOLINT
MetaInfo* info_{}; // NOLINT
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file updater_prune.cc
* \brief prune a tree given the statistics
* \author Tianqi Chen
@@ -8,13 +8,11 @@
#include <memory>
#include "../common/timer.h"
#include "./param.h"
#include "xgboost/base.h"
#include "xgboost/json.h"
#include "./param.h"
#include "../common/timer.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_prune);
/*! \brief pruner that prunes a tree after growing finishes */
@@ -24,47 +22,31 @@ class TreePruner : public TreeUpdater {
syncher_.reset(TreeUpdater::Create("sync", ctx_, task));
pruner_monitor_.Init("TreePruner");
}
char const* Name() const override {
return "prune";
}
[[nodiscard]] char const* Name() const override { return "prune"; }
// set training parameter
void Configure(const Args& args) override {
param_.UpdateAllowUnknown(args);
syncher_->Configure(args);
}
void Configure(const Args& args) override { syncher_->Configure(args); }
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
}
bool CanModifyTree() const override {
return true;
}
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
[[nodiscard]] bool CanModifyTree() const override { return true; }
// update the tree, do pruning
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override {
pruner_monitor_.Start("PrunerUpdate");
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
for (auto tree : trees) {
this->DoPrune(tree);
this->DoPrune(param, tree);
}
param_.learning_rate = lr;
syncher_->Update(gpair, p_fmat, out_position, trees);
syncher_->Update(param, gpair, p_fmat, out_position, trees);
pruner_monitor_.Stop("PrunerUpdate");
}
private:
// try to prune off current leaf
bst_node_t TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*)
bst_node_t TryPruneLeaf(TrainParam const* param, RegTree* p_tree, int nid, int depth,
int npruned) {
auto& tree = *p_tree;
CHECK(tree[nid].IsLeaf());
if (tree[nid].IsRoot()) {
return npruned;
@@ -77,22 +59,22 @@ class TreePruner : public TreeUpdater {
auto right = tree[pid].RightChild();
bool balanced = tree[left].IsLeaf() &&
right != RegTree::kInvalidNodeId && tree[right].IsLeaf();
if (balanced && param_.NeedPrune(s.loss_chg, depth)) {
if (balanced && param->NeedPrune(s.loss_chg, depth)) {
// need to be pruned
tree.ChangeToLeaf(pid, param_.learning_rate * s.base_weight);
tree.ChangeToLeaf(pid, param->learning_rate * s.base_weight);
// tail recursion
return this->TryPruneLeaf(tree, pid, depth - 1, npruned + 2);
return this->TryPruneLeaf(param, p_tree, pid, depth - 1, npruned + 2);
} else {
return npruned;
}
}
/*! \brief do pruning of a tree */
void DoPrune(RegTree* p_tree) {
void DoPrune(TrainParam const* param, RegTree* p_tree) {
auto& tree = *p_tree;
bst_node_t npruned = 0;
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) {
npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned);
}
}
LOG(INFO) << "tree pruning end, "
@@ -103,13 +85,10 @@ class TreePruner : public TreeUpdater {
private:
// synchronizer
std::unique_ptr<TreeUpdater> syncher_;
// training parameter
TrainParam param_;
common::Monitor pruner_monitor_;
};
XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune")
.describe("Pruner that prune the tree according to statistics.")
.set_body([](Context const* ctx, ObjInfo task) { return new TreePruner(ctx, task); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -28,21 +28,14 @@ namespace tree {
DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
void QuantileHistMaker::Configure(const Args &args) {
param_.UpdateAllowUnknown(args);
}
void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair,
DMatrix *dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) {
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
// build tree
const size_t n_trees = trees.size();
if (!pimpl_) {
pimpl_.reset(new Builder(n_trees, param_, dmat, task_, ctx_));
pimpl_.reset(new Builder(n_trees, param, dmat, task_, ctx_));
}
size_t t_idx{0};
@@ -51,8 +44,6 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *d
this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
++t_idx;
}
param_.learning_rate = lr;
}
bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data,
@@ -107,7 +98,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
auto weight = evaluator_->InitRoot(GradStats{grad_stat});
p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
p_tree->Stat(RegTree::kRoot).base_weight = weight;
(*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);
(*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
std::vector<CPUExpandEntry> entries{node};
monitor_->Start("EvaluateSplits");
@@ -173,7 +164,7 @@ void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree,
HostDeviceVector<bst_node_t> *p_out_position) {
monitor_->Start(__func__);
Driver<CPUExpandEntry> driver(param_);
Driver<CPUExpandEntry> driver(*param_);
driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h));
auto const &tree = *p_tree;
auto expand_set = driver.Pop();
@@ -277,21 +268,19 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
} else {
CHECK_EQ(n_total_bins, page.cut.TotalBins());
}
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid);
partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, fmat->IsColumnSplit());
++page_id;
}
histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
collective::IsDistributed());
collective::IsDistributed(), fmat->IsColumnSplit());
auto m_gpair =
linalg::MakeTensorView(*gpair, {gpair->size(), static_cast<std::size_t>(1)}, ctx_->gpu_id);
SampleGradient(ctx_, param_, m_gpair);
auto m_gpair = linalg::MakeTensorView(ctx_, *gpair, gpair->size(), static_cast<std::size_t>(1));
SampleGradient(ctx_, *param_, m_gpair);
}
// store a pointer to the tree
p_last_tree_ = &tree;
evaluator_.reset(
new HistEvaluator<CPUExpandEntry>{param_, info, this->ctx_->Threads(), column_sampler_});
evaluator_.reset(new HistEvaluator<CPUExpandEntry>{ctx_, param_, info, column_sampler_});
monitor_->Stop(__func__);
}

View File

@@ -35,49 +35,36 @@
#include "../common/partition_builder.h"
#include "../common/column_matrix.h"
namespace xgboost {
namespace tree {
inline BatchParam HistBatch(TrainParam const& param) {
return {param.max_bin, param.sparse_threshold};
namespace xgboost::tree {
inline BatchParam HistBatch(TrainParam const* param) {
return {param->max_bin, param->sparse_threshold};
}
/*! \brief construct a tree using quantized feature values */
class QuantileHistMaker: public TreeUpdater {
public:
explicit QuantileHistMaker(Context const* ctx, ObjInfo task) : TreeUpdater(ctx), task_{task} {}
void Configure(const Args& args) override;
void Configure(const Args&) override {}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& trees) override;
bool UpdatePredictionCache(const DMatrix *data,
linalg::VectorView<float> out_preds) override;
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
}
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
char const* Name() const override {
return "grow_quantile_histmaker";
}
bool HasNodePosition() const override { return true; }
[[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; }
[[nodiscard]] bool HasNodePosition() const override { return true; }
protected:
// training parameter
TrainParam param_;
// actual builder that runs the algorithm
struct Builder {
public:
// constructor
explicit Builder(const size_t n_trees, const TrainParam& param, DMatrix const* fmat,
explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat,
ObjInfo task, Context const* ctx)
: n_trees_(n_trees),
param_(param),
@@ -115,7 +102,7 @@ class QuantileHistMaker: public TreeUpdater {
private:
const size_t n_trees_;
const TrainParam& param_;
TrainParam const* param_;
std::shared_ptr<common::ColumnSampler> column_sampler_{
std::make_shared<common::ColumnSampler>()};
@@ -140,7 +127,6 @@ class QuantileHistMaker: public TreeUpdater {
std::unique_ptr<Builder> pimpl_;
ObjInfo task_;
};
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2022 by XGBoost Contributors
/**
* Copyright 2014-2023 by XGBoost Contributors
* \file updater_refresh.cc
* \brief refresh the statistics and leaf value on the tree on the dataset
* \author Tianqi Chen
@@ -16,8 +16,7 @@
#include "./param.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_refresh);
@@ -25,23 +24,14 @@ DMLC_REGISTRY_FILE_TAG(updater_refresh);
class TreeRefresher : public TreeUpdater {
public:
explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}
void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); }
void LoadConfig(Json const& in) override {
auto const& config = get<Object const>(in);
FromJson(config.at("train_param"), &this->param_);
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["train_param"] = ToJson(param_);
}
char const* Name() const override {
return "refresh";
}
bool CanModifyTree() const override {
return true;
}
void Configure(const Args &) override {}
void LoadConfig(Json const &) override {}
void SaveConfig(Json *) const override {}
[[nodiscard]] char const *Name() const override { return "refresh"; }
[[nodiscard]] bool CanModifyTree() const override { return true; }
// update the tree, do pruning
void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree *> &trees) override {
if (trees.size() == 0) return;
@@ -103,16 +93,11 @@ class TreeRefresher : public TreeUpdater {
lazy_get_stats();
collective::Allreduce<collective::Operation::kSum>(&dmlc::BeginPtr(stemp[0])->sum_grad,
stemp[0].size() * 2);
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
param_.learning_rate = lr / trees.size();
int offset = 0;
for (auto tree : trees) {
this->Refresh(dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
offset += tree->param.num_nodes;
}
// set learning rate back
param_.learning_rate = lr;
}
private:
@@ -135,31 +120,27 @@ class TreeRefresher : public TreeUpdater {
gstats[pid].Add(gpair[ridx]);
}
}
inline void Refresh(const GradStats *gstats,
int nid, RegTree *p_tree) {
inline void Refresh(TrainParam const *param, const GradStats *gstats, int nid, RegTree *p_tree) {
RegTree &tree = *p_tree;
tree.Stat(nid).base_weight =
static_cast<bst_float>(CalcWeight(param_, gstats[nid]));
static_cast<bst_float>(CalcWeight(*param, gstats[nid]));
tree.Stat(nid).sum_hess = static_cast<bst_float>(gstats[nid].sum_hess);
if (tree[nid].IsLeaf()) {
if (param_.refresh_leaf) {
tree[nid].SetLeaf(tree.Stat(nid).base_weight * param_.learning_rate);
if (param->refresh_leaf) {
tree[nid].SetLeaf(tree.Stat(nid).base_weight * param->learning_rate);
}
} else {
tree.Stat(nid).loss_chg = static_cast<bst_float>(
xgboost::tree::CalcGain(param_, gstats[tree[nid].LeftChild()]) +
xgboost::tree::CalcGain(param_, gstats[tree[nid].RightChild()]) -
xgboost::tree::CalcGain(param_, gstats[nid]));
this->Refresh(gstats, tree[nid].LeftChild(), p_tree);
this->Refresh(gstats, tree[nid].RightChild(), p_tree);
tree.Stat(nid).loss_chg =
static_cast<bst_float>(xgboost::tree::CalcGain(*param, gstats[tree[nid].LeftChild()]) +
xgboost::tree::CalcGain(*param, gstats[tree[nid].RightChild()]) -
xgboost::tree::CalcGain(*param, gstats[nid]));
this->Refresh(param, gstats, tree[nid].LeftChild(), p_tree);
this->Refresh(param, gstats, tree[nid].RightChild(), p_tree);
}
}
// training parameter
TrainParam param_;
};
XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
.describe("Refresher that refreshes the weight and statistics according to data.")
.set_body([](Context const *ctx, ObjInfo) { return new TreeRefresher(ctx); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
/**
 * Copyright 2014-2023 by XGBoost Contributors
* \file updater_sync.cc
* \brief synchronize the tree in all distributed nodes
*/
@@ -13,8 +13,7 @@
#include "../common/io.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_sync);
@@ -30,11 +29,9 @@ class TreeSyncher : public TreeUpdater {
void LoadConfig(Json const&) override {}
void SaveConfig(Json*) const override {}
char const* Name() const override {
return "prune";
}
[[nodiscard]] char const* Name() const override { return "prune"; }
void Update(HostDeviceVector<GradientPair>*, DMatrix*,
void Update(TrainParam const*, HostDeviceVector<GradientPair>*, DMatrix*,
common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
const std::vector<RegTree*>& trees) override {
if (collective::GetWorldSize() == 1) return;
@@ -57,5 +54,4 @@ class TreeSyncher : public TreeUpdater {
XGBOOST_REGISTER_TREE_UPDATER(TreeSyncher, "sync")
.describe("Syncher that synchronize the tree in all distributed nodes.")
.set_body([](Context const* ctx, ObjInfo) { return new TreeSyncher(ctx); });
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree