temp merge, disable 1 line, SetValid

2023-10-12 16:16:44 -07:00
parent 2e7e9d3b2d 85d3017ca5
commit ea19555474
492 changed files with 15533 additions and 9376 deletions
--- a/src/tree/constraints.cu
+++ b/src/tree/constraints.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <thrust/copy.h>
 #include <thrust/device_vector.h>
@@ -140,20 +140,20 @@ void FeatureInteractionConstraintDevice::Reset() {
 __global__ void ClearBuffersKernel(
    LBitField64 result_buffer_output, LBitField64 result_buffer_input) {
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < result_buffer_output.Size()) {
+  if (tid < result_buffer_output.Capacity()) {
    result_buffer_output.Clear(tid);
  }
-  if (tid < result_buffer_input.Size()) {
+  if (tid < result_buffer_input.Capacity()) {
    result_buffer_input.Clear(tid);
  }
 }

 void FeatureInteractionConstraintDevice::ClearBuffers() {
-  CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
-  CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
+  CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity());
+  CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity());
  uint32_t constexpr kBlockThreads = 256;
  auto const n_grids = static_cast<uint32_t>(
-      common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
+      common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads));
  dh::LaunchKernel {n_grids, kBlockThreads} (
      ClearBuffersKernel,
      output_buffer_bits_, input_buffer_bits_);
@@ -207,11 +207,11 @@ common::Span<bst_feature_t> FeatureInteractionConstraintDevice::Query(
  ClearBuffers();

  LBitField64 node_constraints = s_node_constraints_[nid];
-  CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());
+  CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity());

  uint32_t constexpr kBlockThreads = 256;
  auto n_grids = static_cast<uint32_t>(
-      common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
+      common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads));
  dh::LaunchKernel {n_grids, kBlockThreads} (
      SetInputBufferKernel,
      feature_list, input_buffer_bits_);
@@ -274,13 +274,13 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature,
                                                 LBitField64 left,
                                                 LBitField64 right) {
  auto tid = threadIdx.x + blockDim.x * blockIdx.x;
-  if (tid > node.Size()) {
+  if (tid > node.Capacity()) {
    return;
  }
  // enable constraints from feature
  node |= feature;
  // clear the buffer after use
-  if (tid < feature.Size()) {
+  if (tid < feature.Capacity()) {
    feature.Clear(tid);
  }

@@ -323,7 +323,7 @@ void FeatureInteractionConstraintDevice::Split(
      s_sets_, s_sets_ptr_);

  uint32_t constexpr kBlockThreads = 256;
-  auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Size(), kBlockThreads));
+  auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Capacity(), kBlockThreads));

  dh::LaunchKernel {n_grids, kBlockThreads} (
      InteractionConstraintSplitKernel,
--- a/src/tree/constraints.h
+++ b/src/tree/constraints.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018-2019 by Contributors
+/**
+ * Copyright 2018-2023 by Contributors
 */
 #ifndef XGBOOST_TREE_CONSTRAINTS_H_
 #define XGBOOST_TREE_CONSTRAINTS_H_
@@ -8,10 +8,8 @@
 #include <unordered_set>
 #include <vector>

-#include "xgboost/span.h"
-#include "xgboost/base.h"
-
 #include "param.h"
+#include "xgboost/base.h"

 namespace xgboost {
 /*!
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -55,27 +55,26 @@ void FitStump(Context const* ctx, MetaInfo const& info,
 }  // namespace cpu_impl

 namespace cuda_impl {
-void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
-              linalg::VectorView<float> out);
+void FitStump(Context const* ctx, MetaInfo const& info,
+              linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out);

 #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
-inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
+inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<GradientPair const, 2>,
                     linalg::VectorView<float>) {
  common::AssertGPUSupport();
 }
 #endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 }  // namespace cuda_impl

-void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
+void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
              bst_target_t n_targets, linalg::Vector<float>* out) {
  out->SetDevice(ctx->gpu_id);
  out->Reshape(n_targets);
-  auto n_samples = gpair.Size() / n_targets;

-  gpair.SetDevice(ctx->gpu_id);
-  auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets);
+  gpair.SetDevice(ctx->Device());
+  auto gpair_t = gpair.View(ctx->Device());
  ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
-               : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
+      : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
 }
 }  // namespace tree
 }  // namespace xgboost
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2023 by XGBoost Contributors
 *
 * \brief Utilities for estimating initial score.
 */
@@ -11,6 +11,7 @@

 #include <cstddef>                                // std::size_t

+#include "../collective/aggregator.cuh"
 #include "../collective/communicator-inl.cuh"
 #include "../common/device_helpers.cuh"           // dh::MakeTransformIterator
 #include "fit_stump.h"
@@ -23,8 +24,8 @@
 namespace xgboost {
 namespace tree {
 namespace cuda_impl {
-void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
-              linalg::VectorView<float> out) {
+void FitStump(Context const* ctx, MetaInfo const& info,
+              linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
  auto n_targets = out.Size();
  CHECK_EQ(n_targets, gpair.Shape(1));
  linalg::Vector<GradientPairPrecise> sum = linalg::Constant(ctx, GradientPairPrecise{}, n_targets);
@@ -41,7 +42,7 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
        auto sample = i % gpair.Shape(0);
        return GradientPairPrecise{gpair(sample, target)};
      });
-  auto d_sum = sum.View(ctx->gpu_id);
+  auto d_sum = sum.View(ctx->Device());
  CHECK(d_sum.CContiguous());

  dh::XGBCachingDeviceAllocator<char> alloc;
@@ -55,8 +56,8 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
  thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
                        thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));

-  collective::AllReduce<collective::Operation::kSum>(
-      ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()), d_sum.Size() * 2);
+  collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
+                        d_sum.Size() * 2);

  thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
                     [=] XGBOOST_DEVICE(std::size_t i) mutable {
--- a/src/tree/fit_stump.h
+++ b/src/tree/fit_stump.h
@@ -31,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
 /**
 * @brief Fit a tree stump as an estimation of base_score.
 */
-void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
+void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
              bst_target_t n_targets, linalg::Vector<float>* out);
 }  // namespace tree
 }  // namespace xgboost
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -1,12 +1,12 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
 */
 #include <algorithm>  // std::max
 #include <vector>
 #include <limits>

+#include "../../collective/communicator-inl.cuh"
 #include "../../common/categorical.h"
-#include "../../common/device_helpers.cuh"
 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
@@ -24,13 +24,11 @@
 #define WARP_SIZE 32
 #endif

-namespace xgboost {
 #if defined(XGBOOST_USE_HIP)
 namespace cub = hipcub;
 #endif

-namespace tree {
-
+namespace xgboost::tree {
 // With constraints
 XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
                                       const GradientPairInt64 &missing,
@@ -352,11 +350,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
                                    common::Span<common::CatBitField::value_type> out,
                                    DeviceSplitCandidate *p_out_split) {
  auto &out_split = *p_out_split;
-  out_split.split_cats = common::CatBitField{out};
+  auto out_cats = common::CatBitField{out};

  // Simple case for one hot split
  if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) {
-    out_split.split_cats.Set(common::AsCat(out_split.thresh));
+    out_cats.Set(common::AsCat(out_split.thresh));
    return;
  }

@@ -376,7 +374,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
  assert(partition > 0 && "Invalid partition.");
  thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) {
    auto cat = shared_inputs.feature_values[c - node_offset];
-    out_split.SetCat(cat);
+    out_cats.Set(common::AsCat(cat));
  });
 }

@@ -453,6 +451,24 @@ void GPUHistEvaluator::EvaluateSplits(
  this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
                             evaluator, out_splits);

+  if (is_column_split_) {
+    // With column-wise data split, we gather the split candidates from all the workers and find the
+    // global best candidates.
+    auto const world_size = collective::GetWorldSize();
+    dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
+    auto all_candidates = dh::ToSpan(all_candidate_storage);
+    collective::AllGather(device_, out_splits.data(), all_candidates.data(),
+                          out_splits.size() * sizeof(DeviceSplitCandidate));
+
+    // Reduce to get the best candidate from all workers.
+    dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
+      out_splits[i] = all_candidates[i];
+      for (auto rank = 1; rank < world_size; rank++) {
+        out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
+      }
+    });
+  }
+
  auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
  auto d_entries = out_entries;
  auto device_cats_accessor = this->DeviceCatStorage(nidx);
@@ -471,8 +487,7 @@ void GPUHistEvaluator::EvaluateSplits(

    if (split.is_cat) {
      SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
-                          device_cats_accessor.GetNodeCatStorage(input.nidx),
-                          &out_splits[i]);
+                          device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
    }

    float base_weight =
@@ -510,6 +525,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
 #endif
  return root_entry;
 }
-
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs {
  common::Span<const float> feature_values;
  common::Span<const float> min_fvalue;
  bool is_dense;
-  XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
-  __device__ auto FeatureBins(bst_feature_t fidx) const {
+  [[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
+  [[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const {
    return feature_segments[fidx + 1] - feature_segments[fidx];
  }
 };
@@ -83,6 +83,9 @@ class GPUHistEvaluator {
  // Number of elements of categorical storage type
  // needed to hold categoricals for a single mode
  std::size_t node_categorical_storage_size_ = 0;
+  // Is the data split column-wise?
+  bool is_column_split_ = false;
+  int32_t device_;

  // Copy the categories from device to host asynchronously.
  void CopyToHost( const std::vector<bst_node_t>& nidx);
@@ -102,7 +105,7 @@ class GPUHistEvaluator {
  }

  /**
-   * \brief Get device category storage of nidx for internal calculation.
+   * @brief Get device category storage of nidx for internal calculation.
   */
  auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
    if (!has_categoricals_) return CatAccessor{};
@@ -117,8 +120,8 @@ class GPUHistEvaluator {
  /**
   * \brief Get sorted index storage based on the left node of inputs.
   */
-  auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
-    if(!need_sort_histogram_) return common::Span<bst_feature_t>();
+  auto SortedIdx(int num_nodes, bst_bin_t total_bins) {
+    if (!need_sort_histogram_) return common::Span<bst_feature_t>{};
    cat_sorted_idx_.resize(num_nodes * total_bins);
    return dh::ToSpan(cat_sorted_idx_);
  }
@@ -136,18 +139,29 @@ class GPUHistEvaluator {
   * \brief Reset the evaluator, should be called before any use.
   */
  void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
-             bst_feature_t n_features, TrainParam const &param, int32_t device);
+             bst_feature_t n_features, TrainParam const &param, bool is_column_split,
+             int32_t device);

  /**
   * \brief Get host category storage for nidx.  Different from the internal version, this
   *        returns strictly 1 node.
   */
-  common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
+  [[nodiscard]] common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
    copy_stream_.View().Sync();
    auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(
        nidx * node_categorical_storage_size_, node_categorical_storage_size_);
    return cats_out;
  }
+
+  [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
+    copy_stream_.View().Sync();
+    if (has_categoricals_) {
+      CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
+      return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
+    } else {
+      return common::KCatBitField{};
+    }
+  }
  /**
   * \brief Add a split to the internal tree evaluator.
   */
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -14,10 +14,9 @@

 namespace xgboost {
 namespace tree {
-void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
-                                           common::Span<FeatureType const> ft,
-                                           bst_feature_t n_features, TrainParam const &param,
-                                           int32_t device) {
+void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
+                             bst_feature_t n_features, TrainParam const &param,
+                             bool is_column_split, int32_t device) {
  param_ = param;
  tree_evaluator_ = TreeEvaluator{param, n_features, device};
  has_categoricals_ = cuts.HasCategorical();
@@ -93,6 +92,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
                      });
 #endif
  }
+  is_column_split_ = is_column_split;
+  device_ = device;
 }

 common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -8,10 +8,10 @@
 #include <xgboost/logging.h>

 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <limits>
 #include <utility>

-#include "../../common/compressed_iterator.h"
 #include "../../common/cuda_context.cuh"  // for CUDAContext
 #include "../../common/random.h"
 #include "../param.h"
@@ -146,27 +146,30 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
  CombineGradientPair combine_;
 };

-NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
+NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}

-GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
+GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                       DMatrix* dmat) {
-  return {dmat->Info().num_row_, page_, gpair};
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+  return {dmat->Info().num_row_, page, gpair};
 }

-ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
-                                                   size_t n_rows, BatchParam batch_param)
-    : batch_param_{std::move(batch_param)},
-      page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
-                                n_rows)) {}
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
+    : batch_param_{std::move(batch_param)} {}

 GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
                                                     common::Span<GradientPair> gpair,
                                                     DMatrix* dmat) {
  if (!page_concatenated_) {
    // Concatenate all the external memory ELLPACK pages into a single in-memory page.
+    page_.reset(nullptr);
    size_t offset = 0;
    for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
      auto page = batch.Impl();
+      if (!page_) {
+        page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
+                                                  page->row_stride, dmat->Info().num_row_);
+      }
      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
      offset += num_elements;
    }
@@ -175,8 +178,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
  return {dmat->Info().num_row_, page_.get(), gpair};
 }

-UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
-    : page_(page), subsample_(subsample) {}
+UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
+    : batch_param_{std::move(batch_param)}, subsample_(subsample) {}

 GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                            DMatrix* dmat) {
@@ -185,7 +188,8 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
  thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                     thrust::counting_iterator<std::size_t>(0),
                     BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
-  return {dmat->Info().num_row_, page_, gpair};
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+  return {dmat->Info().num_row_, page, gpair};
 }

 ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
@@ -198,27 +202,27 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
 GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
                                                          common::Span<GradientPair> gpair,
                                                          DMatrix* dmat) {
+  auto cuctx = ctx->CUDACtx();
  // Set gradient pair to 0 with p = 1 - subsample
-  thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
-                     thrust::counting_iterator<size_t>(0),
-                     BernoulliTrial(common::GlobalRandom()(), subsample_),
-                     GradientPair());
+  thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                     thrust::counting_iterator<std::size_t>(0),
+                     BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair{});

  // Count the sampled rows.
-  size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
+  size_t sample_rows =
+      thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero{});

  // Compact gradient pairs.
  gpair_.resize(sample_rows);
-  thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
+  thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero{});

  // Index the sample rows.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
-  thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    IsNonZero());
+  thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
                         sample_row_index_.begin());
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    sample_row_index_.begin(),
-                    sample_row_index_.begin(),
-                    ClearEmptyRows());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    sample_row_index_.begin(), ClearEmptyRows());

  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
  auto first_page = (*batch_iterator.begin()).Impl();
@@ -228,7 +232,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
                                  first_page->row_stride, sample_rows));

  // Compact the ELLPACK pages into the single sample page.
-  thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
+  thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
  for (auto& batch : batch_iterator) {
    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
  }
@@ -236,12 +240,10 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
-                                             size_t n_rows,
-                                             const BatchParam&,
+GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
                                             float subsample)
-    : page_(page),
-      subsample_(subsample),
+    : subsample_(subsample),
+      batch_param_{std::move(batch_param)},
      threshold_(n_rows + 1, 0.0f),
      grad_sum_(n_rows, 0.0f) {}

@@ -252,18 +254,19 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
  size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
      gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+
  // Perform Poisson sampling in place.
  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                    RandomWeight(common::GlobalRandom()())));
-  return {n_rows, page_, gpair};
+  return {n_rows, page, gpair};
 }

-ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
-    size_t n_rows,
-    BatchParam batch_param,
-    float subsample)
+ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
+                                                                         BatchParam batch_param,
+                                                                         float subsample)
    : batch_param_(std::move(batch_param)),
      subsample_(subsample),
      threshold_(n_rows + 1, 0.0f),
@@ -273,16 +276,15 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
 GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
                                                                common::Span<GradientPair> gpair,
                                                                DMatrix* dmat) {
-  size_t n_rows = dmat->Info().num_row_;
+  auto cuctx = ctx->CUDACtx();
+  bst_row_t n_rows = dmat->Info().num_row_;
  size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
      gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

  // Perform Poisson sampling in place.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    thrust::counting_iterator<size_t>(0),
-                    dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_),
-                                    threshold_index,
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
+                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                    RandomWeight(common::GlobalRandom()())));

  // Count the sampled rows.
@@ -290,16 +292,15 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c

  // Compact gradient pairs.
  gpair_.resize(sample_rows);
-  thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
+  thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());

  // Index the sample rows.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
-  thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
-    sample_row_index_.begin());
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    sample_row_index_.begin(),
-                    sample_row_index_.begin(),
-                    ClearEmptyRows());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    IsNonZero());
+  thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
+                         sample_row_index_.begin());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    sample_row_index_.begin(), ClearEmptyRows());

  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
  auto first_page = (*batch_iterator.begin()).Impl();
@@ -317,13 +318,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
-                                           size_t n_rows, const BatchParam& batch_param,
-                                           float subsample, int sampling_method) {
+GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
+                                           const BatchParam& batch_param, float subsample,
+                                           int sampling_method, bool is_external_memory) {
+  // The ctx is kept here for future development of stream-based operations.
  monitor_.Init("gradient_based_sampler");

  bool is_sampling = subsample < 1.0;
-  bool is_external_memory = page->n_rows != n_rows;

  if (is_sampling) {
    switch (sampling_method) {
@@ -331,24 +332,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
        if (is_external_memory) {
          strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
        } else {
-          strategy_.reset(new UniformSampling(page, subsample));
+          strategy_.reset(new UniformSampling(batch_param, subsample));
        }
        break;
      case TrainParam::kGradientBased:
        if (is_external_memory) {
-          strategy_.reset(
-              new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
+          strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
        } else {
-          strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
+          strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
        }
        break;
-      default:LOG(FATAL) << "unknown sampling method";
+      default:
+        LOG(FATAL) << "unknown sampling method";
    }
  } else {
    if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
+      strategy_.reset(new ExternalMemoryNoSampling(batch_param));
    } else {
-      strategy_.reset(new NoSampling(page));
+      strategy_.reset(new NoSampling(batch_param));
    }
  }
 }
@@ -362,11 +363,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
  return sample;
 }

-size_t GradientBasedSampler::CalculateThresholdIndex(
-    common::Span<GradientPair> gpair, common::Span<float> threshold,
-    common::Span<float> grad_sum, size_t sample_rows) {
-  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
-               std::numeric_limits<float>::max());
+size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
+                                                     common::Span<float> threshold,
+                                                     common::Span<float> grad_sum,
+                                                     size_t sample_rows) {
+  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
                    CombineGradientPair());
  thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
@@ -379,6 +380,5 @@ size_t GradientBasedSampler::CalculateThresholdIndex(
      thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
  return thrust::distance(dh::tbegin(grad_sum), min) + 1;
 }
-
 };  // namespace tree
 };  // namespace xgboost
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
 */
 #pragma once
 #include <xgboost/base.h>
@@ -32,37 +32,36 @@ class SamplingStrategy {
 /*! \brief No sampling in in-memory mode. */
 class NoSampling : public SamplingStrategy {
 public:
-  explicit NoSampling(EllpackPageImpl const* page);
-  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
-                             DMatrix* dmat) override;
-
- private:
-  EllpackPageImpl const* page_;
-};
-
-/*! \brief No sampling in external memory mode. */
-class ExternalMemoryNoSampling : public SamplingStrategy {
- public:
-  ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
-                           BatchParam batch_param);
+  explicit NoSampling(BatchParam batch_param);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
  BatchParam batch_param_;
-  std::unique_ptr<EllpackPageImpl> page_;
+};
+
+/*! \brief No sampling in external memory mode. */
+class ExternalMemoryNoSampling : public SamplingStrategy {
+ public:
+  explicit ExternalMemoryNoSampling(BatchParam batch_param);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
+
+ private:
+  BatchParam batch_param_;
+  std::unique_ptr<EllpackPageImpl> page_{nullptr};
  bool page_concatenated_{false};
 };

 /*! \brief Uniform sampling in in-memory mode. */
 class UniformSampling : public SamplingStrategy {
 public:
-  UniformSampling(EllpackPageImpl const* page, float subsample);
+  UniformSampling(BatchParam batch_param, float subsample);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  EllpackPageImpl const* page_;
+  BatchParam batch_param_;
  float subsample_;
 };

@@ -84,13 +83,12 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
 public:
-  GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
-                        float subsample);
+  GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  EllpackPageImpl const* page_;
+  BatchParam batch_param_;
  float subsample_;
  dh::caching_device_vector<float> threshold_;
  dh::caching_device_vector<float> grad_sum_;
@@ -106,11 +104,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 private:
  BatchParam batch_param_;
  float subsample_;
-  dh::caching_device_vector<float> threshold_;
-  dh::caching_device_vector<float> grad_sum_;
+  dh::device_vector<float> threshold_;
+  dh::device_vector<float> grad_sum_;
  std::unique_ptr<EllpackPageImpl> page_;
  dh::device_vector<GradientPair> gpair_;
-  dh::caching_device_vector<size_t> sample_row_index_;
+  dh::device_vector<size_t> sample_row_index_;
 };

 /*! \brief Draw a sample of rows from a DMatrix.
@@ -124,8 +122,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 */
 class GradientBasedSampler {
 public:
-  GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
-                       const BatchParam& batch_param, float subsample, int sampling_method);
+  GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
+                       float subsample, int sampling_method, bool is_external_memory);

  /*! \brief Sample from a DMatrix based on the given gradient pairs. */
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -8,6 +8,7 @@
 #include <cstdint>  // uint32_t
 #include <limits>

+#include "../../collective/aggregator.h"
 #include "../../common/deterministic.cuh"
 #include "../../common/device_helpers.cuh"
 #include "../../data/ellpack_page.cuh"
@@ -52,7 +53,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
 *
 * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
 */
-GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
+GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
  using GradientSumT = GradientPairPrecise;
  using T = typename GradientSumT::ValueT;
  dh::XGBCachingDeviceAllocator<char> alloc;
@@ -70,11 +71,11 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
  // Treat pair as array of 4 primitive types to allreduce
  using ReduceT = typename decltype(p.first)::ValueT;
  static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
-  collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<ReduceT*>(&p), 4);
+  collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
  GradientPair positive_sum{p.first}, negative_sum{p.second};

  std::size_t total_rows = gpair.size();
-  collective::Allreduce<collective::Operation::kSum>(&total_rows, 1);
+  collective::GlobalSum(info, &total_rows, 1);

  auto histogram_rounding =
      GradientSumT{common::CreateRoundingFactor<T>(
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -39,7 +39,7 @@ private:
  GradientPairPrecise to_floating_point_;

 public:
-  explicit GradientQuantiser(common::Span<GradientPair const> gpair);
+  GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
  XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
                               gpair.GetHess() * to_fixed_point_.GetHess());
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -24,21 +24,13 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)

  ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
  thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
-
-#if defined(XGBOOST_USE_CUDA)
-  dh::safe_cuda(cudaStreamCreate(&stream_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipStreamCreate(&stream_));
-#endif
 }

 RowPartitioner::~RowPartitioner() {
 #if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_idx_));
-  dh::safe_cuda(cudaStreamDestroy(stream_));
 #elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipSetDevice(device_idx_));
-  dh::safe_cuda(hipStreamDestroy(stream_));
 #endif
 }

--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -116,13 +116,7 @@ template <typename RowIndexT, typename OpT, typename OpDataT>
 void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                       common::Span<RowIndexT> ridx, common::Span<RowIndexT> ridx_tmp,
                       common::Span<bst_uint> d_counts, std::size_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp,
-#if defined(XGBOOST_USE_HIP)
-                       hipStream_t stream
-#elif defined(XGBOOST_USE_CUDA)
-                       cudaStream_t stream
-#endif
-                       ) {
+                       dh::device_vector<int8_t>* tmp) {
  dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
  WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                             d_counts.data()};
@@ -135,29 +129,28 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
        int batch_idx;
        std::size_t item_idx;
        AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
-        auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
+        auto op_res = op(ridx[item_idx], batch_idx, batch_info_itr[batch_idx].data);
        return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
      });
  size_t temp_bytes = 0;
  if (tmp->empty()) {
 #if defined(XGBOOST_USE_CUDA)
    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp(), total_rows, stream);
+                                   IndexFlagOp(), total_rows);
 #elif defined(XGBOOST_USE_HIP)
    rocprim::inclusive_scan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   total_rows, IndexFlagOp(), stream);
+                                   total_rows,IndexFlagOp());
 #endif
-
    tmp->resize(temp_bytes);
  }
  temp_bytes = tmp->size();

 #if defined(XGBOOST_USE_CUDA)
  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp(), total_rows, stream);
+                                 discard_write_iterator, IndexFlagOp(), total_rows);
 #elif defined(XGBOOST_USE_HIP)
-  rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator,
-                                   total_rows, IndexFlagOp(), stream);
+  rocprim::inclusive_scan(tmp->data().get(), temp_bytes, input_iterator,
+                                 discard_write_iterator, total_rows, IndexFlagOp());
 #endif

  constexpr int kBlockSize = 256;
@@ -167,7 +160,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
  const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread);

  SortPositionCopyKernel<kBlockSize, RowIndexT, OpDataT>
-      <<<grid_size, kBlockSize, 0, stream>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
+      <<<grid_size, kBlockSize, 0>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
 }

 struct NodePositionInfo {
@@ -240,12 +233,6 @@ class RowPartitioner {
  dh::PinnedMemory pinned_;
  dh::PinnedMemory pinned2_;

-#if defined(XGBOOST_USE_HIP)
-  hipStream_t stream_;
-#else
-  cudaStream_t stream_;
-#endif
-
 public:
  RowPartitioner(int device_idx, size_t num_rows);
  ~RowPartitioner();
@@ -303,11 +290,11 @@ class RowPartitioner {
 #if defined(XGBOOST_USE_HIP)
    dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                  h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
-                                  hipMemcpyDefault, stream_));
+                                  hipMemcpyDefault));
 #else
    dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                  h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
-                                  cudaMemcpyDefault, stream_));
+                                  cudaMemcpyDefault));
 #endif

    // Temporary arrays
@@ -317,23 +304,17 @@ class RowPartitioner {
    // Partition the rows according to the operator
    SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
        dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
-        total_rows, op, &tmp_, stream_);
-
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
-                                  hipMemcpyDefault, stream_));
-#else
+        total_rows, op, &tmp_);
+#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
-                                  cudaMemcpyDefault, stream_));
+                                  cudaMemcpyDefault));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
+                                  hipMemcpyDefault));
 #endif
-
    // TODO(Rory): this synchronisation hurts performance a lot
    // Future optimisation should find a way to skip this
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipStreamSynchronize(stream_));
-#else
-    dh::safe_cuda(cudaStreamSynchronize(stream_));
-#endif
+    dh::DefaultStream().Sync();

    // Update segments
    for (size_t i = 0; i < nidx.size(); i++) {
@@ -370,18 +351,18 @@ class RowPartitioner {
 #if defined(XGBOOST_USE_HIP)
    dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
                                  sizeof(NodePositionInfo) * ridx_segments_.size(),
-                                  hipMemcpyDefault, stream_));
+                                  hipMemcpyDefault));
 #else
    dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
                                  sizeof(NodePositionInfo) * ridx_segments_.size(),
-                                  cudaMemcpyDefault, stream_));
+                                  cudaMemcpyDefault));
 #endif

    constexpr int kBlockSize = 512;
    const int kItemsThread = 8;
    const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
    common::Span<const RowIndexT> d_ridx(ridx_.data().get(), ridx_.size());
-    FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0, stream_>>>(
+    FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0>>>(
        dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op);
  }
 };
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -4,13 +4,13 @@
 #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_

-#include <algorithm>                   // for copy
-#include <cstddef>                     // for size_t
-#include <limits>                      // for numeric_limits
-#include <memory>                      // for shared_ptr
-#include <numeric>                     // for accumulate
-#include <utility>                     // for move
-#include <vector>                      // for vector
+#include <algorithm>  // for copy
+#include <cstddef>    // for size_t
+#include <limits>     // for numeric_limits
+#include <memory>     // for shared_ptr
+#include <numeric>    // for accumulate
+#include <utility>    // for move
+#include <vector>     // for vector

 #include "../../common/categorical.h"  // for CatBitField
 #include "../../common/hist_util.h"    // for GHistRow, HistogramCuts
@@ -20,6 +20,7 @@
 #include "../param.h"                  // for TrainParam
 #include "../split_evaluator.h"        // for TreeEvaluator
 #include "expand_entry.h"              // for MultiExpandEntry
+#include "hist_cache.h"                // for BoundedHistCollection
 #include "xgboost/base.h"              // for bst_node_t, bst_target_t, bst_feature_t
 #include "xgboost/context.h"           // for COntext
 #include "xgboost/linalg.h"            // for Constants, Vector
@@ -65,7 +66,7 @@ class HistEvaluator {
   *        pseudo-category for missing value but here we just do a complete scan to avoid
   *        making specialized histogram bin.
   */
-  void EnumerateOneHot(common::HistogramCuts const &cut, const common::GHistRow &hist,
+  void EnumerateOneHot(common::HistogramCuts const &cut, common::ConstGHistRow hist,
                       bst_feature_t fidx, bst_node_t nidx,
                       TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
                       SplitEntry *p_best) const {
@@ -143,7 +144,7 @@ class HistEvaluator {
   */
  template <int d_step>
  void EnumeratePart(common::HistogramCuts const &cut, common::Span<size_t const> sorted_idx,
-                     common::GHistRow const &hist, bst_feature_t fidx, bst_node_t nidx,
+                     common::ConstGHistRow hist, bst_feature_t fidx, bst_node_t nidx,
                     TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
                     SplitEntry *p_best) {
    static_assert(d_step == +1 || d_step == -1, "Invalid step.");
@@ -214,7 +215,7 @@ class HistEvaluator {
  // Returns the sum of gradients corresponding to the data points that contains
  // a non-missing value for the particular feature fid.
  template <int d_step>
-  GradStats EnumerateSplit(common::HistogramCuts const &cut, const common::GHistRow &hist,
+  GradStats EnumerateSplit(common::HistogramCuts const &cut, common::ConstGHistRow hist,
                           bst_feature_t fidx, bst_node_t nidx,
                           TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
                           SplitEntry *p_best) const {
@@ -317,7 +318,7 @@ class HistEvaluator {
  }

 public:
-  void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
+  void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
                      common::Span<FeatureType const> feature_types, const RegTree &tree,
                      std::vector<CPUExpandEntry> *p_entries) {
    auto n_threads = ctx_->Threads();
@@ -454,8 +455,8 @@ class HistEvaluator {
                                   right_child);
  }

-  auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
-  auto const& Stats() const { return snode_; }
+  [[nodiscard]] auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
+  [[nodiscard]] auto const &Stats() const { return snode_; }

  float InitRoot(GradStats const &root_sum) {
    snode_.resize(1);
@@ -510,7 +511,7 @@ class HistMultiEvaluator {

  template <bst_bin_t d_step>
  bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx,
-                      common::Span<common::GHistRow const> hist,
+                      common::Span<common::ConstGHistRow> hist,
                      linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain,
                      SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const {
    auto const &cut_ptr = cut.Ptrs();
@@ -623,7 +624,7 @@ class HistMultiEvaluator {
  }

 public:
-  void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
+  void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
                      common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
    auto &entries = *p_entries;
    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
@@ -651,9 +652,9 @@ class HistMultiEvaluator {
      auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
      auto best = &entry->split;
      auto parent_sum = stats_.Slice(entry->nid, linalg::All());
-      std::vector<common::GHistRow> node_hist;
+      std::vector<common::ConstGHistRow> node_hist;
      for (auto t_hist : hist) {
-        node_hist.push_back((*t_hist)[entry->nid]);
+        node_hist.emplace_back((*t_hist)[entry->nid]);
      }
      auto features_set = features[nidx_in_set]->ConstHostSpan();

@@ -773,7 +774,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
                               std::vector<Partitioner> const &partitioner,
                               linalg::VectorView<float> out_preds) {
  auto const &tree = *p_last_tree;
-  CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId);
+  CHECK(out_preds.Device().IsCPU());
  size_t n_nodes = p_last_tree->GetNodes().size();
  for (auto &part : partitioner) {
    CHECK_EQ(part.Size(), n_nodes);
@@ -808,7 +809,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
  auto n_nodes = mttree->Size();
  auto n_targets = tree.NumTargets();
  CHECK_EQ(out_preds.Shape(1), n_targets);
-  CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId);
+  CHECK(out_preds.Device().IsCPU());

  for (auto &part : partitioner) {
    CHECK_EQ(part.Size(), n_nodes);
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -18,8 +18,8 @@ namespace xgboost::tree {
 */
 template <typename Impl>
 struct ExpandEntryImpl {
-  bst_node_t nid;
-  bst_node_t depth;
+  bst_node_t nid{0};
+  bst_node_t depth{0};

  [[nodiscard]] float GetLossChange() const {
    return static_cast<Impl const*>(this)->split.loss_chg;
--- a/src/tree/hist/hist_cache.h
+++ b/src/tree/hist/hist_cache.h
@@ -0,0 +1,113 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
+#define XGBOOST_TREE_HIST_HIST_CACHE_H_
+#include <cstddef>  // for size_t
+#include <map>      // for map
+#include <memory>   // for unique_ptr
+#include <vector>   // for vector
+
+#include "../../common/hist_util.h"          // for GHistRow, ConstGHistRow
+#include "../../common/ref_resource_view.h"  // for ReallocVector
+#include "xgboost/base.h"                    // for bst_node_t, bst_bin_t
+#include "xgboost/logging.h"                 // for CHECK_GT
+#include "xgboost/span.h"                    // for Span
+
+namespace xgboost::tree {
+/**
+ * @brief A persistent cache for CPU histogram.
+ *
+ *   The size of the cache is first bounded by the `Driver` class then by this cache
+ *   implementation. The former limits the number of nodes that can be built for each node
+ *   batch, while this cache limits the number of all nodes up to the size of
+ *   max(|node_batch|, n_cached_node).
+ *
+ *   The caller is responsible for clearing up the cache as it needs to rearrange the
+ *   nodes before making overflowed allocations. The struct only reports whether the size
+ *   limit has been reached.
+ */
+class BoundedHistCollection {
+  // maps node index to offset in `data_`.
+  std::map<bst_node_t, std::size_t> node_map_;
+  // currently allocated bins, used for tracking consistency.
+  std::size_t current_size_{0};
+
+  // stores the histograms in a contiguous buffer
+  using Vec = common::ReallocVector<GradientPairPrecise>;
+  std::unique_ptr<Vec> data_{new Vec{}};  // nvcc 12.1 trips over std::make_unique
+
+  // number of histogram bins across all features
+  bst_bin_t n_total_bins_{0};
+  // limits the number of nodes that can be in the cache for each tree
+  std::size_t n_cached_nodes_{0};
+  // whether the tree has grown beyond the cache limit
+  bool has_exceeded_{false};
+
+ public:
+  BoundedHistCollection() = default;
+  common::GHistRow operator[](std::size_t idx) {
+    auto offset = node_map_.at(idx);
+    return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
+  }
+  common::ConstGHistRow operator[](std::size_t idx) const {
+    auto offset = node_map_.at(idx);
+    return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
+  }
+  void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
+    n_total_bins_ = n_total_bins;
+    n_cached_nodes_ = n_cached_nodes;
+    this->Clear(false);
+  }
+  /**
+   * @brief Clear the cache, mark whether the cache has exceeded the limit.
+   */
+  void Clear(bool exceeded) {
+    node_map_.clear();
+    current_size_ = 0;
+    has_exceeded_ = exceeded;
+  }
+
+  [[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
+                             common::Span<bst_node_t const> nodes_to_sub) const {
+    auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
+    return n_new_nodes + node_map_.size() <= n_cached_nodes_;
+  }
+
+  /**
+   * @brief Allocate histogram buffers for all nodes.
+   *
+   *   The resulting histogram buffer is contiguous for all nodes in the order of
+   *   allocation.
+   */
+  void AllocateHistograms(common::Span<bst_node_t const> nodes_to_build,
+                          common::Span<bst_node_t const> nodes_to_sub) {
+    auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
+    auto alloc_size = n_new_nodes * n_total_bins_;
+    auto new_size = alloc_size + current_size_;
+    if (new_size > data_->size()) {
+      data_->Resize(new_size);
+    }
+    for (auto nidx : nodes_to_build) {
+      node_map_[nidx] = current_size_;
+      current_size_ += n_total_bins_;
+    }
+    for (auto nidx : nodes_to_sub) {
+      node_map_[nidx] = current_size_;
+      current_size_ += n_total_bins_;
+    }
+    CHECK_EQ(current_size_, new_size);
+  }
+  void AllocateHistograms(std::vector<bst_node_t> const& nodes) {
+    this->AllocateHistograms(common::Span<bst_node_t const>{nodes},
+                             common::Span<bst_node_t const>{});
+  }
+
+  [[nodiscard]] bool HasExceeded() const { return has_exceeded_; }
+  [[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
+    return node_map_.find(nidx) != node_map_.cend();
+  }
+  [[nodiscard]] std::size_t Size() const { return current_size_; }
+};
+}  // namespace xgboost::tree
+#endif  // XGBOOST_TREE_HIST_HIST_CACHE_H_
--- a/src/tree/hist/histogram.cc
+++ b/src/tree/hist/histogram.cc
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include "histogram.h"
+
+#include <cstddef>  // for size_t
+#include <numeric>  // for accumulate
+#include <utility>  // for swap
+#include <vector>   // for vector
+
+#include "../../common/transform_iterator.h"  // for MakeIndexTransformIter
+#include "expand_entry.h"                     // for MultiExpandEntry, CPUExpandEntry
+#include "xgboost/logging.h"                  // for CHECK_NE
+#include "xgboost/span.h"                     // for Span
+#include "xgboost/tree_model.h"               // for RegTree
+
+namespace xgboost::tree {
+void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
+  CHECK_EQ(nodes_to_build.size(), valid_candidates.size());
+
+  std::size_t n_idx = 0;
+  for (auto const &c : valid_candidates) {
+    auto left_nidx = p_tree->LeftChild(c.nid);
+    auto right_nidx = p_tree->RightChild(c.nid);
+
+    auto build_nidx = left_nidx;
+    auto subtract_nidx = right_nidx;
+    auto lit =
+        common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
+    auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
+    auto rit =
+        common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
+    auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
+    auto fewer_right = right_sum < left_sum;
+    if (fewer_right) {
+      std::swap(build_nidx, subtract_nidx);
+    }
+    nodes_to_build[n_idx] = build_nidx;
+    nodes_to_sub[n_idx] = subtract_nidx;
+    ++n_idx;
+  }
+}
+
+void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
+  std::size_t n_idx = 0;
+  for (auto const &c : candidates) {
+    auto left_nidx = (*p_tree)[c.nid].LeftChild();
+    auto right_nidx = (*p_tree)[c.nid].RightChild();
+    auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
+
+    auto build_nidx = left_nidx;
+    auto subtract_nidx = right_nidx;
+    if (fewer_right) {
+      std::swap(build_nidx, subtract_nidx);
+    }
+    nodes_to_build[n_idx] = build_nidx;
+    nodes_to_sub[n_idx] = subtract_nidx;
+    ++n_idx;
+  }
+}
+}  // namespace xgboost::tree
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -4,316 +4,229 @@
 #ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
 #define XGBOOST_TREE_HIST_HISTOGRAM_H_

-#include <algorithm>
-#include <limits>
-#include <vector>
+#include <algorithm>   // for max
+#include <cstddef>     // for size_t
+#include <cstdint>     // for int32_t
+#include <functional>  // for function
+#include <utility>     // for move
+#include <vector>      // for vector

-#include "../../collective/communicator-inl.h"
-#include "../../common/hist_util.h"
-#include "../../data/gradient_index.h"
-#include "expand_entry.h"
-#include "xgboost/tree_model.h"  // for RegTree
+#include "../../collective/communicator-inl.h"  // for Allreduce
+#include "../../collective/communicator.h"      // for Operation
+#include "../../common/hist_util.h"             // for GHistRow, ParallelGHi...
+#include "../../common/row_set.h"               // for RowSetCollection
+#include "../../common/threading_utils.h"       // for ParallelFor2d, Range1d, BlockedSpace2d
+#include "../../data/gradient_index.h"          // for GHistIndexMatrix
+#include "expand_entry.h"                       // for MultiExpandEntry, CPUExpandEntry
+#include "hist_cache.h"                         // for BoundedHistCollection
+#include "param.h"                              // for HistMakerTrainParam
+#include "xgboost/base.h"                       // for bst_node_t, bst_target_t, bst_bin_t
+#include "xgboost/context.h"                    // for Context
+#include "xgboost/data.h"                       // for BatchIterator, BatchSet
+#include "xgboost/linalg.h"                     // for MatrixView, All, Vect...
+#include "xgboost/logging.h"                    // for CHECK_GE
+#include "xgboost/span.h"                       // for Span
+#include "xgboost/tree_model.h"                 // for RegTree
+
+namespace xgboost::tree {
+/**
+ * @brief Decide which node to use as the build node for multi-target trees.
+ */
+void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
+
+/**
+ * @brief Decide which node to use as the build node.
+ */
+void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);

-namespace xgboost {
-namespace tree {
-template <typename ExpandEntry>
 class HistogramBuilder {
  /*! \brief culmulative histogram of gradients. */
-  common::HistCollection hist_;
-  /*! \brief culmulative local parent histogram of gradients. */
-  common::HistCollection hist_local_worker_;
-  common::GHistBuilder builder_;
+  BoundedHistCollection hist_;
  common::ParallelGHistBuilder buffer_;
  BatchParam param_;
  int32_t n_threads_{-1};
-  size_t n_batches_{0};
  // Whether XGBoost is running in distributed environment.
  bool is_distributed_{false};
  bool is_col_split_{false};

 public:
  /**
-   * \param total_bins       Total number of bins across all features
-   * \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
-   *                         training parameter.
-   * \param n_threads        Number of threads.
-   * \param is_distributed   Mostly used for testing to allow injecting parameters instead
+   * @brief Reset the builder, should be called before growing a new tree.
+   *
+   * @param total_bins       Total number of bins across all features
+   * @param is_distributed   Mostly used for testing to allow injecting parameters instead
   *                         of using global rabit variable.
   */
-  void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
-             bool is_distributed, bool is_col_split) {
-    CHECK_GE(n_threads, 1);
-    n_threads_ = n_threads;
-    n_batches_ = n_batches;
+  void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed,
+             bool is_col_split, HistMakerTrainParam const *param) {
+    n_threads_ = ctx->Threads();
    param_ = p;
-    hist_.Init(total_bins);
-    hist_local_worker_.Init(total_bins);
+    hist_.Reset(total_bins, param->max_cached_hist_node);
    buffer_.Init(total_bins);
-    builder_ = common::GHistBuilder(total_bins);
    is_distributed_ = is_distributed;
    is_col_split_ = is_col_split;
-    // Workaround s390x gcc 7.5.0
-    auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce;
  }

  template <bool any_missing>
-  void BuildLocalHistograms(size_t page_idx, common::BlockedSpace2d space,
-                            GHistIndexMatrix const &gidx,
-                            std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
+  void BuildLocalHistograms(common::BlockedSpace2d const &space, GHistIndexMatrix const &gidx,
+                            std::vector<bst_node_t> const &nodes_to_build,
                            common::RowSetCollection const &row_set_collection,
                            common::Span<GradientPair const> gpair_h, bool force_read_by_column) {
-    const size_t n_nodes = nodes_for_explicit_hist_build.size();
-    CHECK_GT(n_nodes, 0);
-
-    std::vector<common::GHistRow> target_hists(n_nodes);
-    for (size_t i = 0; i < n_nodes; ++i) {
-      auto const nidx = nodes_for_explicit_hist_build[i].nid;
-      target_hists[i] = hist_[nidx];
-    }
-    if (page_idx == 0) {
-      // FIXME(jiamingy): Handle different size of space.  Right now we use the maximum
-      // partition size for the buffer, which might not be efficient if partition sizes
-      // has significant variance.
-      buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
-    }
-
    // Parallel processing by nodes and data in each node
    common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) {
      const auto tid = static_cast<unsigned>(omp_get_thread_num());
-      const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid;
-      auto elem = row_set_collection[nid];
+      bst_node_t const nidx = nodes_to_build[nid_in_set];
+      auto elem = row_set_collection[nidx];
      auto start_of_row_set = std::min(r.begin(), elem.Size());
      auto end_of_row_set = std::min(r.end(), elem.Size());
      auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set,
-                                                    elem.begin + end_of_row_set, nid);
+                                                    elem.begin + end_of_row_set, nidx);
      auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
      if (rid_set.Size() != 0) {
-        builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
-                                                 force_read_by_column);
+        common::BuildHist<any_missing>(gpair_h, rid_set, gidx, hist, force_read_by_column);
      }
    });
  }

-  void AddHistRows(int *starting_index, int *sync_count,
-                   std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-                   std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                   RegTree const *p_tree) {
-    if (is_distributed_ && !is_col_split_) {
-      this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build,
-                                   nodes_for_subtraction_trick, p_tree);
-    } else {
-      this->AddHistRowsLocal(starting_index, sync_count, nodes_for_explicit_hist_build,
-                             nodes_for_subtraction_trick);
-    }
-  }
+  /**
+   * @brief Allocate histogram, rearrange the nodes if `rearrange` is true and the tree
+   *        has reached the cache size limit.
+   */
+  void AddHistRows(RegTree const *p_tree, std::vector<bst_node_t> *p_nodes_to_build,
+                   std::vector<bst_node_t> *p_nodes_to_sub, bool rearrange) {
+    CHECK(p_nodes_to_build);
+    auto &nodes_to_build = *p_nodes_to_build;
+    CHECK(p_nodes_to_sub);
+    auto &nodes_to_sub = *p_nodes_to_sub;

-  /** Main entry point of this class, build histogram for tree nodes. */
-  void BuildHist(size_t page_id, common::BlockedSpace2d space, GHistIndexMatrix const &gidx,
-                 RegTree const *p_tree, common::RowSetCollection const &row_set_collection,
-                 std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-                 std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                 common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
-    int starting_index = std::numeric_limits<int>::max();
-    int sync_count = 0;
-    if (page_id == 0) {
-      this->AddHistRows(&starting_index, &sync_count, nodes_for_explicit_hist_build,
-                        nodes_for_subtraction_trick, p_tree);
-    }
-    if (gidx.IsDense()) {
-      this->BuildLocalHistograms<false>(page_id, space, gidx, nodes_for_explicit_hist_build,
-                                        row_set_collection, gpair, force_read_by_column);
-    } else {
-      this->BuildLocalHistograms<true>(page_id, space, gidx, nodes_for_explicit_hist_build,
-                                       row_set_collection, gpair, force_read_by_column);
+    // We first check whether the cache size is already exceeded or about to be exceeded.
+    // If not, then we can allocate histograms without clearing the cache and without
+    // worrying about missing parent histogram.
+    //
+    // Otherwise, we need to rearrange the nodes before the allocation to make sure the
+    // resulting buffer is contiguous. This is to facilitate efficient allreduce.
+
+    bool can_host = this->hist_.CanHost(nodes_to_build, nodes_to_sub);
+    // True if the tree is still within the cache size limit. Allocate histogram as
+    // usual.
+    auto cache_is_valid = can_host && !this->hist_.HasExceeded();
+
+    if (!can_host) {
+      this->hist_.Clear(true);
    }

-    CHECK_GE(n_batches_, 1);
-    if (page_id != n_batches_ - 1) {
+    if (!rearrange || cache_is_valid) {
+      // If not rearrange, we allocate the histogram as usual, assuming the nodes have
+      // been properly arranged by other builders.
+      this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
+      if (rearrange) {
+        CHECK(!this->hist_.HasExceeded());
+      }
      return;
    }

-    if (is_distributed_ && !is_col_split_) {
-      this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
-                                     nodes_for_subtraction_trick,
-                                     starting_index, sync_count);
+    // The cache is full, parent histogram might have been removed in previous iterations
+    // to save memory.
+    std::vector<bst_node_t> can_subtract;
+    for (auto const &v : nodes_to_sub) {
+      if (this->hist_.HistogramExists(p_tree->Parent(v))) {
+        // We can still use the subtraction trick for this node
+        can_subtract.push_back(v);
+      } else {
+        // This node requires a full build
+        nodes_to_build.push_back(v);
+      }
+    }
+
+    nodes_to_sub = std::move(can_subtract);
+    this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
+  }
+
+  /** Main entry point of this class, build histogram for tree nodes. */
+  void BuildHist(std::size_t page_idx, common::BlockedSpace2d const &space,
+                 GHistIndexMatrix const &gidx, common::RowSetCollection const &row_set_collection,
+                 std::vector<bst_node_t> const &nodes_to_build,
+                 linalg::VectorView<GradientPair const> gpair, bool force_read_by_column = false) {
+    CHECK(gpair.Contiguous());
+
+    if (page_idx == 0) {
+      // Add the local histogram cache to the parallel buffer before processing the first page.
+      auto n_nodes = nodes_to_build.size();
+      std::vector<common::GHistRow> target_hists(n_nodes);
+      for (size_t i = 0; i < n_nodes; ++i) {
+        auto const nidx = nodes_to_build[i];
+        target_hists[i] = hist_[nidx];
+      }
+      buffer_.Reset(this->n_threads_, n_nodes, space, target_hists);
+    }
+
+    if (gidx.IsDense()) {
+      this->BuildLocalHistograms<false>(space, gidx, nodes_to_build, row_set_collection,
+                                        gpair.Values(), force_read_by_column);
    } else {
-      this->SyncHistogramLocal(p_tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
+      this->BuildLocalHistograms<true>(space, gidx, nodes_to_build, row_set_collection,
+                                       gpair.Values(), force_read_by_column);
    }
  }
-  /** same as the other build hist but handles only single batch data (in-core) */
-  void BuildHist(size_t page_id, GHistIndexMatrix const &gidx, RegTree *p_tree,
-                 common::RowSetCollection const &row_set_collection,
-                 std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-                 std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                 common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
-    const size_t n_nodes = nodes_for_explicit_hist_build.size();
-    // create space of size (# rows in each node)
+
+  void SyncHistogram(RegTree const *p_tree, std::vector<bst_node_t> const &nodes_to_build,
+                     std::vector<bst_node_t> const &nodes_to_trick) {
+    auto n_total_bins = buffer_.TotalBins();
    common::BlockedSpace2d space(
-        n_nodes,
-        [&](size_t nidx_in_set) {
-          const int32_t nidx = nodes_for_explicit_hist_build[nidx_in_set].nid;
-          return row_set_collection[nidx].Size();
-        },
-        256);
-    this->BuildHist(page_id, space, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build,
-                    nodes_for_subtraction_trick, gpair, force_read_by_column);
-  }
-
-  void SyncHistogramDistributed(RegTree const *p_tree,
-                                std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-                                std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                                int starting_index, int sync_count) {
-    const size_t nbins = builder_.GetNumBins();
-    common::BlockedSpace2d space(
-        nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024);
-    common::ParallelFor2d(space, n_threads_, [&](size_t node, common::Range1d r) {
-      const auto &entry = nodes_for_explicit_hist_build[node];
-      auto this_hist = this->hist_[entry.nid];
-      // Merging histograms from each thread into once
-      buffer_.ReduceHist(node, r.begin(), r.end());
-      // Store posible parent node
-      auto this_local = hist_local_worker_[entry.nid];
-      common::CopyHist(this_local, this_hist, r.begin(), r.end());
-
-      if (!p_tree->IsRoot(entry.nid)) {
-        const size_t parent_id = p_tree->Parent(entry.nid);
-        const int subtraction_node_id = nodes_for_subtraction_trick[node].nid;
-        auto parent_hist = this->hist_local_worker_[parent_id];
-        auto sibling_hist = this->hist_[subtraction_node_id];
-        common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
-        // Store posible parent node
-        auto sibling_local = hist_local_worker_[subtraction_node_id];
-        common::CopyHist(sibling_local, sibling_hist, r.begin(), r.end());
-      }
-    });
-
-    collective::Allreduce<collective::Operation::kSum>(
-        reinterpret_cast<double *>(this->hist_[starting_index].data()),
-        builder_.GetNumBins() * sync_count * 2);
-
-    ParallelSubtractionHist(space, nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
-                            p_tree);
-
-    common::BlockedSpace2d space2(
-        nodes_for_subtraction_trick.size(), [&](size_t) { return nbins; }, 1024);
-    ParallelSubtractionHist(space2, nodes_for_subtraction_trick, nodes_for_explicit_hist_build,
-                            p_tree);
-  }
-
-  void SyncHistogramLocal(RegTree const *p_tree,
-                          std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-                          std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
-    const size_t nbins = this->builder_.GetNumBins();
-    common::BlockedSpace2d space(
-        nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024);
-
+        nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024);
    common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
-      const auto &entry = nodes_for_explicit_hist_build[node];
-      auto this_hist = this->hist_[entry.nid];
-      // Merging histograms from each thread into once
+      // Merging histograms from each thread.
      this->buffer_.ReduceHist(node, r.begin(), r.end());
-
-      if (!p_tree->IsRoot(entry.nid)) {
-        auto const parent_id = p_tree->Parent(entry.nid);
-        auto const subtraction_node_id = nodes_for_subtraction_trick[node].nid;
-        auto parent_hist = this->hist_[parent_id];
-        auto sibling_hist = this->hist_[subtraction_node_id];
-        common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
-      }
    });
+    if (is_distributed_ && !is_col_split_) {
+      // The cache is contiguous, we can perform allreduce for all nodes in one go.
+      CHECK(!nodes_to_build.empty());
+      auto first_nidx = nodes_to_build.front();
+      std::size_t n = n_total_bins * nodes_to_build.size() * 2;
+      collective::Allreduce<collective::Operation::kSum>(
+          reinterpret_cast<double *>(this->hist_[first_nidx].data()), n);
+    }
+
+    common::BlockedSpace2d const &subspace =
+        nodes_to_trick.size() == nodes_to_build.size()
+            ? space
+            : common::BlockedSpace2d{nodes_to_trick.size(),
+                                     [&](std::size_t) { return n_total_bins; }, 1024};
+    common::ParallelFor2d(
+        subspace, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
+          auto subtraction_nidx = nodes_to_trick[nidx_in_set];
+          auto parent_id = p_tree->Parent(subtraction_nidx);
+          auto sibling_nidx = p_tree->IsLeftChild(subtraction_nidx) ? p_tree->RightChild(parent_id)
+                                                                    : p_tree->LeftChild(parent_id);
+          auto sibling_hist = this->hist_[sibling_nidx];
+          auto parent_hist = this->hist_[parent_id];
+          auto subtract_hist = this->hist_[subtraction_nidx];
+          common::SubtractionHist(subtract_hist, parent_hist, sibling_hist, r.begin(), r.end());
+        });
  }

 public:
  /* Getters for tests. */
-  common::HistCollection const &Histogram() { return hist_; }
-  auto& Buffer() { return buffer_; }
-
- private:
-  void
-  ParallelSubtractionHist(const common::BlockedSpace2d &space,
-                          const std::vector<ExpandEntry> &nodes,
-                          const std::vector<ExpandEntry> &subtraction_nodes,
-                          const RegTree *p_tree) {
-    common::ParallelFor2d(
-        space, this->n_threads_, [&](size_t node, common::Range1d r) {
-          const auto &entry = nodes[node];
-          if (!(p_tree->IsLeftChild(entry.nid))) {
-            auto this_hist = this->hist_[entry.nid];
-
-            if (!p_tree->IsRoot(entry.nid)) {
-              const int subtraction_node_id = subtraction_nodes[node].nid;
-              auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()];
-              auto sibling_hist = hist_[subtraction_node_id];
-              common::SubtractionHist(this_hist, parent_hist, sibling_hist,
-                                      r.begin(), r.end());
-            }
-          }
-        });
-  }
-
-  // Add a tree node to histogram buffer in local training environment.
-  void AddHistRowsLocal(
-      int *starting_index, int *sync_count,
-      std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-      std::vector<ExpandEntry> const &nodes_for_subtraction_trick) {
-    for (auto const &entry : nodes_for_explicit_hist_build) {
-      int nid = entry.nid;
-      this->hist_.AddHistRow(nid);
-      (*starting_index) = std::min(nid, (*starting_index));
-    }
-    (*sync_count) = nodes_for_explicit_hist_build.size();
-
-    for (auto const &node : nodes_for_subtraction_trick) {
-      this->hist_.AddHistRow(node.nid);
-    }
-    this->hist_.AllocateAllData();
-  }
-
-  void AddHistRowsDistributed(int *starting_index, int *sync_count,
-                              std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
-                              std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                              RegTree const *p_tree) {
-    const size_t explicit_size = nodes_for_explicit_hist_build.size();
-    const size_t subtaction_size = nodes_for_subtraction_trick.size();
-    std::vector<int> merged_node_ids(explicit_size + subtaction_size);
-    for (size_t i = 0; i < explicit_size; ++i) {
-      merged_node_ids[i] = nodes_for_explicit_hist_build[i].nid;
-    }
-    for (size_t i = 0; i < subtaction_size; ++i) {
-      merged_node_ids[explicit_size + i] = nodes_for_subtraction_trick[i].nid;
-    }
-    std::sort(merged_node_ids.begin(), merged_node_ids.end());
-    int n_left = 0;
-    for (auto const &nid : merged_node_ids) {
-      if (p_tree->IsLeftChild(nid)) {
-        this->hist_.AddHistRow(nid);
-        (*starting_index) = std::min(nid, (*starting_index));
-        n_left++;
-        this->hist_local_worker_.AddHistRow(nid);
-      }
-    }
-    for (auto const &nid : merged_node_ids) {
-      if (!(p_tree->IsLeftChild(nid))) {
-        this->hist_.AddHistRow(nid);
-        this->hist_local_worker_.AddHistRow(nid);
-      }
-    }
-    this->hist_.AllocateAllData();
-    this->hist_local_worker_.AllocateAllData();
-    (*sync_count) = std::max(1, n_left);
-  }
+  [[nodiscard]] BoundedHistCollection const &Histogram() const { return hist_; }
+  [[nodiscard]] BoundedHistCollection &Histogram() { return hist_; }
+  auto &Buffer() { return buffer_; }
 };

 // Construct a work space for building histogram.  Eventually we should move this
 // function into histogram builder once hist tree method supports external memory.
-template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
+template <typename Partitioner>
 common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
-                                          std::vector<ExpandEntry> const &nodes_to_build) {
-  std::vector<size_t> partition_size(nodes_to_build.size(), 0);
+                                          std::vector<bst_node_t> const &nodes_to_build) {
+  // FIXME(jiamingy): Handle different sizes of space.  Right now we use the maximum
+  // partition size for the buffer, which might not be efficient if partition sizes
+  // have significant variance.
+  std::vector<std::size_t> partition_size(nodes_to_build.size(), 0);
  for (auto const &partition : partitioners) {
    size_t k = 0;
-    for (auto node : nodes_to_build) {
-      auto n_rows_in_node = partition.Partitions()[node.nid].Size();
+    for (auto nidx : nodes_to_build) {
+      auto n_rows_in_node = partition.Partitions()[nidx].Size();
      partition_size[k] = std::max(partition_size[k], n_rows_in_node);
      k++;
    }
@@ -322,6 +235,107 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
      nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256};
  return space;
 }
-}      // namespace tree
-}      // namespace xgboost
+
+/**
+ * @brief Histogram builder that can handle multiple targets.
+ */
+class MultiHistogramBuilder {
+  std::vector<HistogramBuilder> target_builders_;
+  Context const *ctx_;
+
+ public:
+  /**
+   * @brief Build the histogram for root node.
+   */
+  template <typename Partitioner, typename ExpandEntry>
+  void BuildRootHist(DMatrix *p_fmat, RegTree const *p_tree,
+                     std::vector<Partitioner> const &partitioners,
+                     linalg::MatrixView<GradientPair const> gpair, ExpandEntry const &best,
+                     BatchParam const &param, bool force_read_by_column = false) {
+    auto n_targets = p_tree->NumTargets();
+    CHECK_EQ(gpair.Shape(1), n_targets);
+    CHECK_EQ(p_fmat->Info().num_row_, gpair.Shape(0));
+    CHECK_EQ(target_builders_.size(), n_targets);
+    std::vector<bst_node_t> nodes{best.nid};
+    std::vector<bst_node_t> dummy_sub;
+
+    auto space = ConstructHistSpace(partitioners, nodes);
+    for (bst_target_t t{0}; t < n_targets; ++t) {
+      this->target_builders_[t].AddHistRows(p_tree, &nodes, &dummy_sub, false);
+    }
+    CHECK(dummy_sub.empty());
+
+    std::size_t page_idx{0};
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        auto t_gpair = gpair.Slice(linalg::All(), t);
+        this->target_builders_[t].BuildHist(page_idx, space, gidx,
+                                            partitioners[page_idx].Partitions(), nodes, t_gpair,
+                                            force_read_by_column);
+      }
+      ++page_idx;
+    }
+
+    for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
+      this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub);
+    }
+  }
+  /**
+   * @brief Build histogram for left and right child of valid candidates
+   */
+  template <typename Partitioner, typename ExpandEntry>
+  void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
+                          std::vector<Partitioner> const &partitioners,
+                          std::vector<ExpandEntry> const &valid_candidates,
+                          linalg::MatrixView<GradientPair const> gpair, BatchParam const &param,
+                          bool force_read_by_column = false) {
+    std::vector<bst_node_t> nodes_to_build(valid_candidates.size());
+    std::vector<bst_node_t> nodes_to_sub(valid_candidates.size());
+    AssignNodes(p_tree, valid_candidates, nodes_to_build, nodes_to_sub);
+
+    // use the first builder for getting number of valid nodes.
+    target_builders_.front().AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, true);
+    CHECK_GE(nodes_to_build.size(), nodes_to_sub.size());
+    CHECK_EQ(nodes_to_sub.size() + nodes_to_build.size(), valid_candidates.size() * 2);
+
+    // allocate storage for the rest of the builders
+    for (bst_target_t t = 1; t < target_builders_.size(); ++t) {
+      target_builders_[t].AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, false);
+    }
+
+    auto space = ConstructHistSpace(partitioners, nodes_to_build);
+    std::size_t page_idx{0};
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
+      CHECK_EQ(gpair.Shape(1), p_tree->NumTargets());
+      for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
+        auto t_gpair = gpair.Slice(linalg::All(), t);
+        CHECK_EQ(t_gpair.Shape(0), p_fmat->Info().num_row_);
+        this->target_builders_[t].BuildHist(page_idx, space, page,
+                                            partitioners[page_idx].Partitions(), nodes_to_build,
+                                            t_gpair, force_read_by_column);
+      }
+      page_idx++;
+    }
+
+    for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
+      this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
+    }
+  }
+
+  [[nodiscard]] auto const &Histogram(bst_target_t t) const {
+    return target_builders_[t].Histogram();
+  }
+  [[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); }
+
+  void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p,
+             bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) {
+    ctx_ = ctx;
+    target_builders_.resize(n_targets);
+    CHECK_GE(n_targets, 1);
+    for (auto &v : target_builders_) {
+      v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param);
+    }
+  }
+};
+}  // namespace xgboost::tree
 #endif  // XGBOOST_TREE_HIST_HISTOGRAM_H_
--- a/src/tree/hist/param.cc
+++ b/src/tree/hist/param.cc
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2021-2023, XGBoost Contributors
+ */
+#include "param.h"
+
+#include <string>  // for string
+
+#include "../../collective/communicator-inl.h"  // for GetRank, Broadcast
+#include "xgboost/json.h"                       // for Object, Json
+#include "xgboost/tree_model.h"                 // for RegTree
+
+namespace xgboost::tree {
+DMLC_REGISTER_PARAMETER(HistMakerTrainParam);
+
+void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const {
+  if (!this->debug_synchronize) {
+    return;
+  }
+
+  std::string s_model;
+  Json model{Object{}};
+  int rank = collective::GetRank();
+  if (rank == 0) {
+    local_tree->SaveModel(&model);
+  }
+  Json::Dump(model, &s_model, std::ios::binary);
+  collective::Broadcast(&s_model, 0);
+
+  RegTree ref_tree{};  // rank 0 tree
+  auto j_ref_tree = Json::Load(StringView{s_model}, std::ios::binary);
+  ref_tree.LoadModel(j_ref_tree);
+  CHECK(*local_tree == ref_tree);
+}
+}  // namespace xgboost::tree
--- a/src/tree/hist/param.h
+++ b/src/tree/hist/param.h
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2021-2023, XGBoost Contributors
+ */
+#pragma once
+
+#include <cstddef>  // for size_t
+
+#include "xgboost/parameter.h"   // for XGBoostParameter
+#include "xgboost/tree_model.h"  // for RegTree
+
+namespace xgboost::tree {
+struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
+  constexpr static std::size_t DefaultNodes() { return static_cast<std::size_t>(1) << 16; }
+
+  bool debug_synchronize{false};
+  std::size_t max_cached_hist_node{DefaultNodes()};
+
+  void CheckTreesSynchronized(RegTree const* local_tree) const;
+
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(HistMakerTrainParam) {
+    DMLC_DECLARE_FIELD(debug_synchronize)
+        .set_default(false)
+        .describe("Check if all distributed tree are identical after tree construction.");
+    DMLC_DECLARE_FIELD(max_cached_hist_node)
+        .set_default(DefaultNodes())
+        .set_lower_bound(1)
+        .describe("Maximum number of nodes in CPU histogram cache. Only for internal usage.");
+  }
+};
+}  // namespace xgboost::tree
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -526,7 +526,7 @@ struct SplitEntryContainer {
   * \return whether the proposed split is better and can replace current split
   */
  template <typename GradientSumT>
-  bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
+  bool Update(bst_float new_loss_chg, bst_feature_t split_index, float new_split_value,
              bool default_left, bool is_cat, GradientSumT const &left_sum,
              GradientSumT const &right_sum) {
    if (this->NeedReplace(new_loss_chg, split_index)) {
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -213,7 +213,7 @@ std::vector<bst_cat_t> GetSplitCategories(RegTree const &tree, int32_t nidx) {
  auto split = common::KCatBitField{csr.categories.subspan(seg.beg, seg.size)};

  std::vector<bst_cat_t> cats;
-  for (size_t i = 0; i < split.Size(); ++i) {
+  for (size_t i = 0; i < split.Capacity(); ++i) {
    if (split.Check(i)) {
      cats.push_back(static_cast<bst_cat_t>(i));
    }
@@ -398,11 +398,14 @@ class JsonGenerator : public TreeGenerator {
    static std::string const kIndicatorTemplate =
        R"ID( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", "yes": {yes}, "no": {no})ID";
    auto split_index = tree[nid].SplitIndex();
+    auto fname = fmap_.Name(split_index);
+    std::string qfname;  // quoted
+    common::EscapeU8(fname, &qfname);
    auto result = SuperT::Match(
        kIndicatorTemplate,
        {{"{nid}",   std::to_string(nid)},
         {"{depth}", std::to_string(depth)},
-         {"{fname}", fmap_.Name(split_index)},
+         {"{fname}", qfname},
         {"{yes}",   std::to_string(nyes)},
         {"{no}",    std::to_string(tree[nid].DefaultChild())}});
    return result;
@@ -430,12 +433,14 @@ class JsonGenerator : public TreeGenerator {
                            std::string const &template_str, std::string cond,
                            uint32_t depth) const {
    auto split_index = tree[nid].SplitIndex();
+    auto fname = split_index < fmap_.Size() ? fmap_.Name(split_index) : std::to_string(split_index);
+    std::string qfname;  // quoted
+    common::EscapeU8(fname, &qfname);
    std::string const result = SuperT::Match(
        template_str,
        {{"{nid}",     std::to_string(nid)},
         {"{depth}",   std::to_string(depth)},
-         {"{fname}",   split_index < fmap_.Size() ? fmap_.Name(split_index) :
-                                                    std::to_string(split_index)},
+         {"{fname}",   qfname},
         {"{cond}",    cond},
         {"{left}",    std::to_string(tree[nid].LeftChild())},
         {"{right}",   std::to_string(tree[nid].RightChild())},
@@ -1004,7 +1009,7 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const {
      auto segment = split_categories_segments_[i];
      auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size);
      common::KCatBitField const cat_bits(node_categories);
-      for (size_t i = 0; i < cat_bits.Size(); ++i) {
+      for (size_t i = 0; i < cat_bits.Capacity(); ++i) {
        if (cat_bits.Check(i)) {
          categories.GetArray().emplace_back(i);
        }
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -3,27 +3,39 @@
 *
 * \brief Implementation for the approx tree method.
 */
-#include <algorithm>
-#include <memory>
-#include <vector>
+#include <algorithm>  // for max, transform, fill_n
+#include <cstddef>    // for size_t
+#include <map>        // for map
+#include <memory>     // for allocator, unique_ptr, make_shared, make_unique
+#include <utility>    // for move
+#include <vector>     // for vector

-#include "../collective/aggregator.h"
-#include "../common/random.h"
-#include "../data/gradient_index.h"
-#include "common_row_partitioner.h"
-#include "constraints.h"
-#include "driver.h"
-#include "hist/evaluate_splits.h"
-#include "hist/histogram.h"
-#include "hist/sampler.h"  // for SampleGradient
-#include "param.h"
-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/json.h"
-#include "xgboost/linalg.h"
-#include "xgboost/task.h"          // for ObjInfo
-#include "xgboost/tree_model.h"
-#include "xgboost/tree_updater.h"  // for TreeUpdater
+#include "../collective/aggregator.h"        // for GlobalSum
+#include "../collective/communicator-inl.h"  // for IsDistributed
+#include "../common/hist_util.h"             // for HistogramCuts
+#include "../common/random.h"                // for ColumnSampler
+#include "../common/timer.h"                 // for Monitor
+#include "../data/gradient_index.h"          // for GHistIndexMatrix
+#include "common_row_partitioner.h"          // for CommonRowPartitioner
+#include "dmlc/registry.h"                   // for DMLC_REGISTRY_FILE_TAG
+#include "driver.h"                          // for Driver
+#include "hist/evaluate_splits.h"            // for HistEvaluator, UpdatePredictionCacheImpl
+#include "hist/expand_entry.h"               // for CPUExpandEntry
+#include "hist/histogram.h"                  // for MultiHistogramBuilder
+#include "hist/param.h"                      // for HistMakerTrainParam
+#include "hist/sampler.h"                    // for SampleGradient
+#include "param.h"                           // for GradStats, TrainParam
+#include "xgboost/base.h"                    // for Args, GradientPair, bst_node_t, bst_bin_t
+#include "xgboost/context.h"                 // for Context
+#include "xgboost/data.h"                    // for DMatrix, BatchSet, BatchIterator, MetaInfo
+#include "xgboost/host_device_vector.h"      // for HostDeviceVector
+#include "xgboost/json.h"                    // for Object, Json, FromJson, ToJson, get
+#include "xgboost/linalg.h"                  // for Matrix, MakeTensorView, Empty, MatrixView
+#include "xgboost/logging.h"                 // for LogCheck_EQ, CHECK_EQ, CHECK
+#include "xgboost/span.h"                    // for Span
+#include "xgboost/task.h"                    // for ObjInfo
+#include "xgboost/tree_model.h"              // for RegTree, RTreeNodeStat
+#include "xgboost/tree_updater.h"            // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...

 namespace xgboost::tree {

@@ -43,9 +55,10 @@ auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
 class GloablApproxBuilder {
 protected:
  TrainParam const *param_;
+  HistMakerTrainParam const *hist_param_{nullptr};
  std::shared_ptr<common::ColumnSampler> col_sampler_;
  HistEvaluator evaluator_;
-  HistogramBuilder<CPUExpandEntry> histogram_builder_;
+  MultiHistogramBuilder histogram_builder_;
  Context const *ctx_;
  ObjInfo const *const task_;

@@ -58,7 +71,7 @@ class GloablApproxBuilder {
  common::HistogramCuts feature_values_;

 public:
-  void InitData(DMatrix *p_fmat, common::Span<float> hess) {
+  void InitData(DMatrix *p_fmat, RegTree const *p_tree, common::Span<float> hess) {
    monitor_->Start(__func__);

    n_batches_ = 0;
@@ -78,8 +91,9 @@ class GloablApproxBuilder {
      n_batches_++;
    }

-    histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
-                             collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
+    histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess),
+                             collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
+                             hist_param_);
    monitor_->Stop(__func__);
  }

@@ -95,20 +109,16 @@ class GloablApproxBuilder {
    }
    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
    std::vector<CPUExpandEntry> nodes{best};
-    size_t i = 0;
-    auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
-      histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
-                                   {}, gpair);
-      i++;
-    }
+    this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
+                                           linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),
+                                           best, BatchSpec(*param_, hess));

    auto weight = evaluator_.InitRoot(root_sum);
    p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
    p_tree->Stat(RegTree::kRoot).base_weight = weight;
    (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);

-    auto const &histograms = histogram_builder_.Histogram();
+    auto const &histograms = histogram_builder_.Histogram(0);
    auto ft = p_fmat->Info().feature_types.ConstHostSpan();
    evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &nodes);
    monitor_->Stop(__func__);
@@ -129,30 +139,9 @@ class GloablApproxBuilder {
                      std::vector<CPUExpandEntry> const &valid_candidates,
                      std::vector<GradientPair> const &gpair, common::Span<float> hess) {
    monitor_->Start(__func__);
-    std::vector<CPUExpandEntry> nodes_to_build;
-    std::vector<CPUExpandEntry> nodes_to_sub;
-
-    for (auto const &c : valid_candidates) {
-      auto left_nidx = (*p_tree)[c.nid].LeftChild();
-      auto right_nidx = (*p_tree)[c.nid].RightChild();
-      auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
-
-      auto build_nidx = left_nidx;
-      auto subtract_nidx = right_nidx;
-      if (fewer_right) {
-        std::swap(build_nidx, subtract_nidx);
-      }
-      nodes_to_build.push_back(CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}});
-      nodes_to_sub.push_back(CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}});
-    }
-
-    size_t i = 0;
-    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
-      histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
-                                   nodes_to_build, nodes_to_sub, gpair);
-      i++;
-    }
+    this->histogram_builder_.BuildHistLeftRight(
+        p_fmat, p_tree, partitioner_, valid_candidates,
+        linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess));
    monitor_->Stop(__func__);
  }

@@ -169,10 +158,12 @@ class GloablApproxBuilder {
  }

 public:
-  explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx,
+  explicit GloablApproxBuilder(TrainParam const *param, HistMakerTrainParam const *hist_param,
+                               MetaInfo const &info, Context const *ctx,
                               std::shared_ptr<common::ColumnSampler> column_sampler,
                               ObjInfo const *task, common::Monitor *monitor)
      : param_{param},
+        hist_param_{hist_param},
        col_sampler_{std::move(column_sampler)},
        evaluator_{ctx, param_, info, col_sampler_},
        ctx_{ctx},
@@ -182,7 +173,7 @@ class GloablApproxBuilder {
  void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,
                  RegTree *p_tree, HostDeviceVector<bst_node_t> *p_out_position) {
    p_last_tree_ = p_tree;
-    this->InitData(p_fmat, hess);
+    this->InitData(p_fmat, p_tree, hess);

    Driver<CPUExpandEntry> driver(*param_);
    auto &tree = *p_tree;
@@ -232,7 +223,7 @@ class GloablApproxBuilder {
          best_splits.push_back(l_best);
          best_splits.push_back(r_best);
        }
-        auto const &histograms = histogram_builder_.Histogram();
+        auto const &histograms = histogram_builder_.Histogram(0);
        auto ft = p_fmat->Info().feature_types.ConstHostSpan();
        monitor_->Start("EvaluateSplits");
        evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &best_splits);
@@ -260,6 +251,7 @@ class GlobalApproxUpdater : public TreeUpdater {
  std::shared_ptr<common::ColumnSampler> column_sampler_ =
      std::make_shared<common::ColumnSampler>();
  ObjInfo const *task_;
+  HistMakerTrainParam hist_param_;

 public:
  explicit GlobalApproxUpdater(Context const *ctx, ObjInfo const *task)
@@ -267,25 +259,33 @@ class GlobalApproxUpdater : public TreeUpdater {
    monitor_.Init(__func__);
  }

-  void Configure(Args const &) override {}
-  void LoadConfig(Json const &) override {}
-  void SaveConfig(Json *) const override {}
+  void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); }
+  void LoadConfig(Json const &in) override {
+    auto const &config = get<Object const>(in);
+    FromJson(config.at("hist_train_param"), &hist_param_);
+  }
+  void SaveConfig(Json *p_out) const override {
+    auto &out = *p_out;
+    out["hist_train_param"] = ToJson(hist_param_);
+  }

-  void InitData(TrainParam const &param, HostDeviceVector<GradientPair> const *gpair,
+  void InitData(TrainParam const &param, linalg::Matrix<GradientPair> const *gpair,
                linalg::Matrix<GradientPair> *sampled) {
    *sampled = linalg::Empty<GradientPair>(ctx_, gpair->Size(), 1);
-    sampled->Data()->Copy(*gpair);
+    auto in = gpair->HostView().Values();
+    std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data());

    SampleGradient(ctx_, param, sampled->HostView());
  }

  [[nodiscard]] char const *Name() const override { return "grow_histmaker"; }

-  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *m,
+  void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *m,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree *> &trees) override {
-    pimpl_ = std::make_unique<GloablApproxBuilder>(param, m->Info(), ctx_, column_sampler_, task_,
-                                                   &monitor_);
+    CHECK(hist_param_.GetInitialised());
+    pimpl_ = std::make_unique<GloablApproxBuilder>(param, &hist_param_, m->Info(), ctx_,
+                                                   column_sampler_, task_, &monitor_);

    linalg::Matrix<GradientPair> h_gpair;
    // Obtain the hessian values for weighted sketching
@@ -300,6 +300,7 @@ class GlobalApproxUpdater : public TreeUpdater {
    std::size_t t_idx = 0;
    for (auto p_tree : trees) {
      this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
+      hist_param_.CheckTreesSynchronized(p_tree);
      ++t_idx;
    }
  }
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -91,7 +91,7 @@ class ColMaker: public TreeUpdater {
    }
  }

-  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
+  void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *dmat,
              common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
              const std::vector<RegTree *> &trees) override {
    if (collective::IsDistributed()) {
@@ -106,10 +106,11 @@ class ColMaker: public TreeUpdater {
    // rescale learning rate according to size of trees
    interaction_constraints_.Configure(*param, dmat->Info().num_row_);
    // build tree
+    CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
    for (auto tree : trees) {
      CHECK(ctx_);
      Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_);
-      builder.Update(gpair->ConstHostVector(), dmat, tree);
+      builder.Update(gpair->Data()->ConstHostVector(), dmat, tree);
    }
  }

--- a/src/tree/updater_gpu_common.cuh
+++ b/src/tree/updater_gpu_common.cuh
@@ -72,7 +72,6 @@ struct DeviceSplitCandidate {
  // split.
  bst_cat_t thresh{-1};

-  common::CatBitField split_cats;
  bool is_cat { false };

  GradientPairInt64 left_sum;
@@ -80,12 +79,6 @@ struct DeviceSplitCandidate {

  XGBOOST_DEVICE DeviceSplitCandidate() {}  // NOLINT

-  template <typename T>
-  XGBOOST_DEVICE void SetCat(T c) {
-    this->split_cats.Set(common::AsCat(c));
-    fvalue = std::max(this->fvalue, static_cast<float>(c));
-  }
-
  XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in,
                             int findex_in, GradientPairInt64 left_sum_in,
                             GradientPairInt64 right_sum_in, bool cat,
@@ -108,22 +101,23 @@ struct DeviceSplitCandidate {
   */
  XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in,
                                bst_feature_t findex_in, GradientPairInt64 left_sum_in,
-                                GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) {
-    if (loss_chg_in > loss_chg &&
-        quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
-        quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
-      loss_chg = loss_chg_in;
-      dir = dir_in;
-      fvalue = std::numeric_limits<float>::quiet_NaN();
-      thresh = thresh_in;
-      is_cat = true;
-      left_sum = left_sum_in;
-      right_sum = right_sum_in;
-      findex = findex_in;
-    }
+                                GradientPairInt64 right_sum_in, GPUTrainingParam const& param,
+                                const GradientQuantiser& quantiser) {
+      if (loss_chg_in > loss_chg &&
+          quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight &&
+          quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) {
+        loss_chg = loss_chg_in;
+        dir = dir_in;
+        fvalue = std::numeric_limits<float>::quiet_NaN();
+        thresh = thresh_in;
+        is_cat = true;
+        left_sum = left_sum_in;
+        right_sum = right_sum_in;
+        findex = findex_in;
+      }
  }

-  XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
+  [[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }

  friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
    os << "loss_chg:" << c.loss_chg << ", "
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -7,12 +7,13 @@

 #include <algorithm>
 #include <cmath>
-#include <limits>
-#include <memory>
-#include <utility>
+#include <cstddef>  // for size_t
+#include <memory>   // for unique_ptr, make_unique
+#include <utility>  // for move
 #include <vector>

-#include "../collective/communicator-inl.cuh"
+#include "../collective/aggregator.h"
+#include "../collective/aggregator.cuh"
 #include "../common/bitfield.h"
 #include "../common/categorical.h"

@@ -22,6 +23,7 @@
 #include "../common/io.h"
 #include "../common/timer.h"
 #include "../data/ellpack_page.cuh"
+#include "../data/ellpack_page.h"
 #include "constraints.cuh"
 #include "driver.h"
 #include "gpu_hist/evaluate_splits.cuh"
@@ -30,8 +32,8 @@
 #include "gpu_hist/gradient_based_sampler.cuh"
 #include "gpu_hist/histogram.cuh"
 #include "gpu_hist/row_partitioner.cuh"
+#include "hist/param.h"
 #include "param.h"
-#include "split_evaluator.h"
 #include "updater_gpu_common.cuh"
 #include "xgboost/base.h"
 #include "xgboost/context.h"
@@ -48,20 +50,6 @@ namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 #endif  // !defined(GTEST_TEST)

-// training parameters specific to this algorithm
-struct GPUHistMakerTrainParam
-    : public XGBoostParameter<GPUHistMakerTrainParam> {
-  bool debug_synchronize;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
-    DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
-        "Check if all distributed tree are identical after tree construction.");
-  }
-};
-#if !defined(GTEST_TEST)
-DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
-#endif  // !defined(GTEST_TEST)
-
 /**
 * \struct  DeviceHistogramStorage
 *
@@ -170,16 +158,16 @@ class DeviceHistogramStorage {
 };

 // Manage memory for a single GPU
-template <typename GradientSumT>
 struct GPUHistMakerDevice {
 private:
  GPUHistEvaluator evaluator_;
  Context const* ctx_;
+  std::shared_ptr<common::ColumnSampler> column_sampler_;
+  MetaInfo const& info_;

 public:
-  EllpackPageImpl const* page;
+  EllpackPageImpl const* page{nullptr};
  common::Span<FeatureType const> feature_types;
-  BatchParam batch_param;

  std::unique_ptr<RowPartitioner> row_partitioner;
  DeviceHistogramStorage<> hist{};
@@ -199,98 +187,95 @@ struct GPUHistMakerDevice {
  dh::PinnedMemory pinned2;

  common::Monitor monitor;
-  common::ColumnSampler column_sampler;
  FeatureInteractionConstraintDevice interaction_constraints;

  std::unique_ptr<GradientBasedSampler> sampler;

  std::unique_ptr<FeatureGroups> feature_groups;

-
-  GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
-                     common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
-                     TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
-                     BatchParam _batch_param)
+  GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
+                     common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
+                     TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
+                     uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
      : evaluator_{_param, n_features, ctx->gpu_id},
        ctx_(ctx),
-        page(_page),
        feature_types{_feature_types},
        param(std::move(_param)),
-        column_sampler(column_sampler_seed),
+        column_sampler_(std::move(column_sampler)),
        interaction_constraints(param, n_features),
-        batch_param(std::move(_batch_param)) {
-    sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
-                                           param.sampling_method));
+        info_{info} {
+    sampler = std::make_unique<GradientBasedSampler>(ctx, _n_rows, batch_param, param.subsample,
+                                                     param.sampling_method, is_external_memory);
    if (!param.monotone_constraints.empty()) {
      // Copy assigning an empty vector causes an exception in MSVC debug builds
      monotone_constraints = param.monotone_constraints;
    }

-    // Init histogram
-    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
+    CHECK(column_sampler_);
    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
-    feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
-                                           dh::MaxSharedMemoryOptin(ctx_->gpu_id),
-                                           sizeof(GradientSumT)));
  }

-  ~GPUHistMakerDevice() {  // NOLINT
-#if defined(XGBOOST_USE_CUDA)
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
+  ~GPUHistMakerDevice() = default;
+
+  void InitFeatureGroupsOnce() {
+    if (!feature_groups) {
+      CHECK(page);
+      feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
+                                                       dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                                       sizeof(GradientPairPrecise));
+    }
  }

  // Reset values for each update iteration
-  // Note that the column sampler must be passed by value because it is not
-  // thread safe
  void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
    auto const& info = dmat->Info();
-    this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
-                              param.colsample_bynode, param.colsample_bylevel,
-                              param.colsample_bytree);
+    this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
+                                param.colsample_bynode, param.colsample_bylevel,
+                                param.colsample_bytree);
 #if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
 #elif defined(XGBOOST_USE_HIP)
    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
 #endif

-    this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
-                           ctx_->gpu_id);
-
    this->interaction_constraints.Reset();

    if (d_gpair.size() != dh_gpair->Size()) {
      d_gpair.resize(dh_gpair->Size());
    }
-
 #if defined(XGBOOST_USE_CUDA)
-    dh::safe_cuda(cudaMemcpyAsync(
-        d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
-        dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
+                                  dh_gpair->Size() * sizeof(GradientPair),
+                                  cudaMemcpyDeviceToDevice));
 #elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
-        dh_gpair->Size() * sizeof(GradientPair), hipMemcpyDeviceToDevice));
+    dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
+                                  dh_gpair->Size() * sizeof(GradientPair),
+                                  hipMemcpyDeviceToDevice));
 #endif
-
    auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
    page = sample.page;
    gpair = sample.gpair;

-    quantiser.reset(new GradientQuantiser(this->gpair));
+    this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
+                           dmat->Info().IsColumnSplit(), ctx_->gpu_id);
+
+    quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());

    row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id,  sample.sample_rows));
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
+
+    // Init histogram
+    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
    hist.Reset();
+
+    this->InitFeatureGroupsOnce();
  }

  GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
    int nidx = RegTree::kRoot;
    GPUTrainingParam gpu_param(param);
-    auto sampled_features = column_sampler.GetFeatureSet(0);
-    sampled_features->SetDevice(ctx_->gpu_id);
+    auto sampled_features = column_sampler_->GetFeatureSet(0);
+    sampled_features->SetDevice(ctx_->Device());
    common::Span<bst_feature_t> feature_set =
        interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
    auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
@@ -324,19 +309,19 @@ struct GPUHistMakerDevice {
    dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
    // Store the feature set ptrs so they dont go out of scope before the kernel is called
    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
-    for (size_t i = 0; i < candidates.size(); i++) {
+    for (std::size_t i = 0; i < candidates.size(); i++) {
      auto candidate = candidates.at(i);
      int left_nidx = tree[candidate.nid].LeftChild();
      int right_nidx = tree[candidate.nid].RightChild();
      nidx[i * 2] = left_nidx;
      nidx[i * 2 + 1] = right_nidx;
-      auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
-      left_sampled_features->SetDevice(ctx_->gpu_id);
+      auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
+      left_sampled_features->SetDevice(ctx_->Device());
      feature_sets.emplace_back(left_sampled_features);
      common::Span<bst_feature_t> left_feature_set =
          interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
-      auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
-      right_sampled_features->SetDevice(ctx_->gpu_id);
+      auto right_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(right_nidx));
+      right_sampled_features->SetDevice(ctx_->Device());
      feature_sets.emplace_back(right_sampled_features);
      common::Span<bst_feature_t> right_feature_set =
          interaction_constraints.Query(right_sampled_features->DeviceSpan(),
@@ -363,10 +348,8 @@ struct GPUHistMakerDevice {
        h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
 #endif

-    this->evaluator_.EvaluateSplits(nidx, max_active_features,
-                                    dh::ToSpan(d_node_inputs), shared_inputs,
-                                    dh::ToSpan(entries));
-
+    this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
+                                    shared_inputs, dh::ToSpan(entries));
 #if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                  entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
@@ -378,7 +361,7 @@ struct GPUHistMakerDevice {
 #endif

    dh::DefaultStream().Sync();
-    }
+  }

  void BuildHist(int nidx) {
    auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -410,31 +393,108 @@ struct GPUHistMakerDevice {
  struct NodeSplitData {
    RegTree::Node split_node;
    FeatureType split_type;
-    common::CatBitField node_cats;
+    common::KCatBitField node_cats;
  };

-  void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
-    if (candidates.empty()) return;
-    std::vector<int> nidx(candidates.size());
-    std::vector<int> left_nidx(candidates.size());
-    std::vector<int> right_nidx(candidates.size());
+  void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix,
+                                 std::vector<NodeSplitData> const& split_data,
+                                 std::vector<bst_node_t> const& nidx,
+                                 std::vector<bst_node_t> const& left_nidx,
+                                 std::vector<bst_node_t> const& right_nidx) {
+    auto const num_candidates = split_data.size();
+
+    using BitVector = LBitField64;
+    using BitType = BitVector::value_type;
+    auto const size = BitVector::ComputeStorageSize(d_matrix.n_rows * num_candidates);
+    dh::TemporaryArray<BitType> decision_storage(size, 0);
+    dh::TemporaryArray<BitType> missing_storage(size, 0);
+    BitVector decision_bits{dh::ToSpan(decision_storage)};
+    BitVector missing_bits{dh::ToSpan(missing_storage)};
+
+    dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
+#if defined(XGBOOST_USE_CUDA)
+    dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
+                                  num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
+                                  num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
+#endif
+    auto d_split_data = dh::ToSpan(split_data_storage);
+
+    dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
+      for (auto i = 0; i < num_candidates; i++) {
+        auto const& data = d_split_data[i];
+        auto const cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
+        if (isnan(cut_value)) {
+          missing_bits.Set(ridx * num_candidates + i);
+        } else {
+          bool go_left;
+          if (data.split_type == FeatureType::kCategorical) {
+            go_left = common::Decision(data.node_cats.Bits(), cut_value);
+          } else {
+            go_left = cut_value <= data.split_node.SplitCond();
+          }
+          if (go_left) {
+            decision_bits.Set(ridx * num_candidates + i);
+          }
+        }
+      }
+    });
+
+    collective::AllReduce<collective::Operation::kBitwiseOR>(
+        ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
+    collective::AllReduce<collective::Operation::kBitwiseAND>(
+        ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
+    collective::Synchronize(ctx_->gpu_id);
+
+    row_partitioner->UpdatePositionBatch(
+        nidx, left_nidx, right_nidx, split_data,
+        [=] __device__(bst_uint ridx, int split_index, NodeSplitData const& data) {
+          auto const index = ridx * num_candidates + split_index;
+          bool go_left;
+          if (missing_bits.Check(index)) {
+            go_left = data.split_node.DefaultLeft();
+          } else {
+            go_left = decision_bits.Check(index);
+          }
+          return go_left;
+        });
+  }
+
+  void UpdatePosition(std::vector<GPUExpandEntry> const& candidates, RegTree* p_tree) {
+    if (candidates.empty()) {
+      return;
+    }
+
+    std::vector<bst_node_t> nidx(candidates.size());
+    std::vector<bst_node_t> left_nidx(candidates.size());
+    std::vector<bst_node_t> right_nidx(candidates.size());
    std::vector<NodeSplitData> split_data(candidates.size());
+
    for (size_t i = 0; i < candidates.size(); i++) {
-      auto& e = candidates[i];
+      auto const& e = candidates[i];
      RegTree::Node split_node = (*p_tree)[e.nid];
      auto split_type = p_tree->NodeSplitType(e.nid);
      nidx.at(i) = e.nid;
      left_nidx.at(i) = split_node.LeftChild();
      right_nidx.at(i) = split_node.RightChild();
-      split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
+      split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)};
+
+      CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
    }

    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+
+    if (info_.IsColumnSplit()) {
+      UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
+      return;
+    }
+
    row_partitioner->UpdatePositionBatch(
        nidx, left_nidx, right_nidx, split_data,
-        [=] __device__(bst_uint ridx, const NodeSplitData& data) {
+        [=] __device__(bst_uint ridx, int split_index, const NodeSplitData& data) {
          // given a row index, returns the node id it belongs to
-          bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
+          float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
          // Missing value
          bool go_left = true;
          if (isnan(cut_value)) {
@@ -569,14 +629,14 @@ struct GPUHistMakerDevice {
    }

    CHECK(p_tree);
+    CHECK(out_preds_d.Device().IsCUDA());
+    CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());

 #if defined(XGBOOST_USE_CUDA)
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
 #elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
 #endif
-    CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
-
    auto d_position = dh::ToSpan(positions);
    CHECK_EQ(out_preds_d.Size(), d_position.size());

@@ -609,9 +669,8 @@ struct GPUHistMakerDevice {
    monitor.Start("AllReduce");
    auto d_node_hist = hist.GetNodeHistogram(nidx).data();
    using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
-    collective::AllReduce<collective::Operation::kSum>(
-        ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
-        page->Cuts().TotalBins() * 2 * num_histograms);
+    collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
+                          page->Cuts().TotalBins() * 2 * num_histograms);

    monitor.Stop("AllReduce");
  }
@@ -692,7 +751,6 @@ struct GPUHistMakerDevice {
      CHECK(common::CheckNAN(candidate.split.fvalue));
      std::vector<common::CatBitField::value_type> split_cats;

-      CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
      auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
      auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex);
      split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
@@ -713,7 +771,6 @@ struct GPUHistMakerDevice {
    evaluator_.ApplyTreeSplit(candidate, p_tree);

    const auto& parent = tree[candidate.nid];
-    std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild());
    interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
                                  parent.RightChild());
  }
@@ -730,8 +787,7 @@ struct GPUHistMakerDevice {
        dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
                   GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
    using ReduceT = typename decltype(root_sum_quantised)::ValueT;
-    collective::Allreduce<collective::Operation::kSum>(
-        reinterpret_cast<ReduceT *>(&root_sum_quantised), 2);
+    collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);

    hist.AllocateHistograms({kRootNIdx});
    this->BuildHist(kRootNIdx);
@@ -749,9 +805,8 @@ struct GPUHistMakerDevice {
    return root_entry;
  }

-  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
-                  ObjInfo const* task, RegTree* p_tree,
-                  HostDeviceVector<bst_node_t>* p_out_position) {
+  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat, ObjInfo const* task,
+                  RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
    auto& tree = *p_tree;
    // Process maximum 32 nodes at a time
    Driver<GPUExpandEntry> driver(param, 32);
@@ -776,7 +831,6 @@ struct GPUHistMakerDevice {
      std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set),
                   [&](const auto& e) { return driver.IsChildValid(e); });

-
      auto new_candidates =
          pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());

@@ -809,8 +863,7 @@ class GPUHistMaker : public TreeUpdater {
  using GradientSumT = GradientPairPrecise;

 public:
-  explicit GPUHistMaker(Context const* ctx, ObjInfo const* task)
-      : TreeUpdater(ctx), task_{task} {};
+  explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {};
  void Configure(const Args& args) override {
    // Used in test to count how many configurations are performed
    LOG(DEBUG) << "[GPU Hist]: Configure";
@@ -823,32 +876,31 @@ class GPUHistMaker : public TreeUpdater {

  void LoadConfig(Json const& in) override {
    auto const& config = get<Object const>(in);
-    FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_);
+    FromJson(config.at("hist_train_param"), &this->hist_maker_param_);  // same key as SaveConfig()
    initialised_ = false;
  }
  void SaveConfig(Json* p_out) const override {
    auto& out = *p_out;
-    out["gpu_hist_train_param"] = ToJson(hist_maker_param_);
+    out["hist_train_param"] = ToJson(hist_maker_param_);  // same key as LoadConfig()
  }

  ~GPUHistMaker() {  // NOLINT
    dh::GlobalMemoryLogger().Log();
  }

-  void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+  void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override {
    monitor_.Start("Update");

+    CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
+    auto gpair_hdv = gpair->Data();
    // build tree
    try {
-      size_t t_idx{0};
+      std::size_t t_idx{0};
      for (xgboost::RegTree* tree : trees) {
-        this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
-
-        if (hist_maker_param_.debug_synchronize) {
-          this->CheckTreesSynchronized(tree);
-        }
+        this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
+        this->hist_maker_param_.CheckTreesSynchronized(tree);
        ++t_idx;
      }

@@ -870,9 +922,9 @@ class GPUHistMaker : public TreeUpdater {
    // Synchronise the column sampling seed
    uint32_t column_sampling_seed = common::GlobalRandom()();
    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
+    this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

    auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
-    auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
 #if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
 #elif defined(XGBOOST_USE_HIP)
@@ -880,9 +932,9 @@ class GPUHistMaker : public TreeUpdater {
 #endif

    info_->feature_types.SetDevice(ctx_->gpu_id);
-    maker.reset(new GPUHistMakerDevice<GradientSumT>(
-        ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
-        column_sampling_seed, info_->num_col_, batch_param));
+    maker = std::make_unique<GPUHistMakerDevice>(
+        ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
+        *param, column_sampler_, info_->num_col_, batch_param, dmat->Info());

    p_last_fmat_ = dmat;
    initialised_ = true;
@@ -895,21 +947,7 @@ class GPUHistMaker : public TreeUpdater {
      monitor_.Stop("InitDataOnce");
    }
    p_last_tree_ = p_tree;
-  }
-
-  // Only call this method for testing
-  void CheckTreesSynchronized(RegTree* local_tree) const {
-    std::string s_model;
-    common::MemoryBufferStream fs(&s_model);
-    int rank = collective::GetRank();
-    if (rank == 0) {
-      local_tree->Save(&fs);
-    }
-    fs.Seek(0);
-    collective::Broadcast(&s_model, 0);
-    RegTree reference_tree{};  // rank 0 tree
-    reference_tree.Load(&fs);
-    CHECK(*local_tree == reference_tree);
+    CHECK(hist_maker_param_.GetInitialised());
  }

  void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
@@ -935,7 +973,7 @@ class GPUHistMaker : public TreeUpdater {

  MetaInfo* info_{};  // NOLINT

-  std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker;  // NOLINT
+  std::unique_ptr<GPUHistMakerDevice> maker;  // NOLINT

  [[nodiscard]] char const* Name() const override { return "grow_gpu_hist"; }
  [[nodiscard]] bool HasNodePosition() const override { return true; }
@@ -943,13 +981,14 @@ class GPUHistMaker : public TreeUpdater {
 private:
  bool initialised_{false};

-  GPUHistMakerTrainParam hist_maker_param_;
+  HistMakerTrainParam hist_maker_param_;

  DMatrix* p_last_fmat_{nullptr};
  RegTree const* p_last_tree_{nullptr};
  ObjInfo const* task_{nullptr};

  common::Monitor monitor_;
+  std::shared_ptr<common::ColumnSampler> column_sampler_;
 };

 #if !defined(GTEST_TEST)
@@ -959,4 +998,135 @@ XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
      return new GPUHistMaker(ctx, task);
    });
 #endif  // !defined(GTEST_TEST)
+
+class GPUGlobalApproxMaker : public TreeUpdater {  // tree updater named "grow_gpu_approx"
+ public:
+  explicit GPUGlobalApproxMaker(Context const* ctx, ObjInfo const* task)
+      : TreeUpdater(ctx), task_{task} {};
+  void Configure(Args const& args) override {
+    // The debug log below is used in tests to count how many configurations are performed.
+    LOG(DEBUG) << "[GPU Approx]: Configure";
+    hist_maker_param_.UpdateAllowUnknown(args);
+    if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) {
+      LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU.";
+    }
+    dh::CheckComputeCapability();
+    initialised_ = false;  // makes InitDataOnce() run again
+
+    monitor_.Init(this->Name());
+  }
+  // Load `hist_maker_param_` from the "hist_train_param" entry and reset `initialised_`.
+  void LoadConfig(Json const& in) override {
+    auto const& config = get<Object const>(in);
+    FromJson(config.at("hist_train_param"), &this->hist_maker_param_);
+    initialised_ = false;
+  }
+  void SaveConfig(Json* p_out) const override {  // writes the key LoadConfig() reads
+    auto& out = *p_out;
+    out["hist_train_param"] = ToJson(hist_maker_param_);
+  }
+  ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); }
+  // Builds a fresh `maker_` on every call, then updates each tree in `trees`.
+  void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree*>& trees) override {
+    monitor_.Start("Update");
+
+    this->InitDataOnce(p_fmat);
+    // Copy the hessian of every gradient pair into the device vector `hess_`.
+    hess_.resize(gpair->Size());
+    auto hess = dh::ToSpan(hess_);
+
+    gpair->SetDevice(ctx_->Device());
+    auto d_gpair = gpair->Data()->ConstDeviceSpan();
+    auto cuctx = ctx_->CUDACtx();
+    thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess),
+                      [=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); });
+
+    auto const& info = p_fmat->Info();
+    info.feature_types.SetDevice(ctx_->Device());
+    auto batch = BatchParam{param->max_bin, hess, !task_->const_hess};
+    maker_ = std::make_unique<GPUHistMakerDevice>(
+        ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_,
+        *param, column_sampler_, info.num_col_, batch, p_fmat->Info());
+
+    std::size_t t_idx{0};
+    for (xgboost::RegTree* tree : trees) {
+      this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]);
+      this->hist_maker_param_.CheckTreesSynchronized(tree);
+      ++t_idx;
+    }
+
+    monitor_.Stop("Update");
+  }
+  // One-time setup guarded by `initialised_`: creates `column_sampler_`, records `p_fmat`.
+  void InitDataOnce(DMatrix* p_fmat) {
+    if (this->initialised_) {
+      return;
+    }
+
+    monitor_.Start(__func__);
+    CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal();
+    // Synchronise the column sampling seed (broadcast; the trailing 0 is presumably the root).
+    uint32_t column_sampling_seed = common::GlobalRandom()();
+    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
+    this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
+
+    p_last_fmat_ = p_fmat;
+    initialised_ = true;
+    monitor_.Stop(__func__);
+  }
+  // Runs InitDataOnce(), remembers `p_tree`, and checks that `hist_maker_param_` is initialised.
+  void InitData(DMatrix* p_fmat, RegTree const* p_tree) {
+    this->InitDataOnce(p_fmat);
+    p_last_tree_ = p_tree;
+    CHECK(hist_maker_param_.GetInitialised());
+  }
+  // Updates one tree: runs InitData(), sets the device of `gpair`, then delegates to `maker_`.
+  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
+                  HostDeviceVector<bst_node_t>* p_out_position) {
+    monitor_.Start("InitData");
+    this->InitData(p_fmat, p_tree);
+    monitor_.Stop("InitData");
+
+    gpair->SetDevice(ctx_->gpu_id);  // NOTE(review): Update() uses ctx_->Device(); unify?
+    maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
+  }
+  // Returns false unless `maker_` exists and `data` is the non-null `p_last_fmat_`.
+  bool UpdatePredictionCache(const DMatrix* data,
+                             linalg::MatrixView<bst_float> p_out_preds) override {
+    if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
+      return false;
+    }
+    monitor_.Start("UpdatePredictionCache");
+    bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_);
+    monitor_.Stop("UpdatePredictionCache");
+    return result;
+  }
+
+  [[nodiscard]] char const* Name() const override { return "grow_gpu_approx"; }
+  [[nodiscard]] bool HasNodePosition() const override { return true; }
+
+ private:
+  bool initialised_{false};  // set by InitDataOnce(), cleared by Configure()/LoadConfig()
+
+  HistMakerTrainParam hist_maker_param_;
+  dh::device_vector<float> hess_;  // filled in Update()
+  std::shared_ptr<common::ColumnSampler> column_sampler_;
+  std::unique_ptr<GPUHistMakerDevice> maker_;  // rebuilt in Update()
+
+  DMatrix* p_last_fmat_{nullptr};  // set in InitDataOnce()
+  RegTree const* p_last_tree_{nullptr};  // set in InitData()
+  ObjInfo const* task_{nullptr};
+
+  common::Monitor monitor_;
+};
+
+#if !defined(GTEST_TEST)
+XGBOOST_REGISTER_TREE_UPDATER(GPUApproxMaker, "grow_gpu_approx")
+    .describe("Grow tree with GPU.")
+    .set_body([](Context const* ctx, ObjInfo const* task) {
+      return new GPUGlobalApproxMaker(ctx, task);  // its Name() is "grow_gpu_approx" too
+    });
+#endif  // !defined(GTEST_TEST)
 }  // namespace xgboost::tree
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -31,7 +31,7 @@ class TreePruner : public TreeUpdater {
  [[nodiscard]] bool CanModifyTree() const override { return true; }

  // update the tree, do pruning
-  void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
+  void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree*>& trees) override {
    pruner_monitor_.Start("PrunerUpdate");
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -4,39 +4,40 @@
 * \brief use quantized feature values to construct a tree
 * \author Philip Cho, Tianqi Checn, Egor Smirnov
 */
-#include <algorithm>                         // for max, copy, transform
-#include <cstddef>                           // for size_t
-#include <cstdint>                           // for uint32_t, int32_t
-#include <memory>                            // for unique_ptr, allocator, make_unique, shared_ptr
-#include <numeric>                           // for accumulate
-#include <ostream>                           // for basic_ostream, char_traits, operator<<
-#include <utility>                           // for move, swap
-#include <vector>                            // for vector
+#include <algorithm>  // for max, copy, transform
+#include <cstddef>    // for size_t
+#include <cstdint>    // for uint32_t, int32_t
+#include <exception>  // for exception
+#include <memory>     // for allocator, unique_ptr, make_unique, shared_ptr
+#include <ostream>    // for operator<<, basic_ostream, char_traits
+#include <utility>    // for move
+#include <vector>     // for vector

 #include "../collective/aggregator.h"        // for GlobalSum
-#include "../collective/communicator-inl.h"  // for Allreduce, IsDistributed
-#include "../collective/communicator.h"      // for Operation
-#include "../common/hist_util.h"             // for HistogramCuts, HistCollection
+#include "../collective/communicator-inl.h"  // for IsDistributed
+#include "../common/hist_util.h"             // for HistogramCuts, GHistRow
 #include "../common/linalg_op.h"             // for begin, cbegin, cend
 #include "../common/random.h"                // for ColumnSampler
 #include "../common/threading_utils.h"       // for ParallelFor
 #include "../common/timer.h"                 // for Monitor
-#include "../common/transform_iterator.h"    // for IndexTransformIter, MakeIndexTransformIter
+#include "../common/transform_iterator.h"    // for IndexTransformIter
 #include "../data/gradient_index.h"          // for GHistIndexMatrix
 #include "common_row_partitioner.h"          // for CommonRowPartitioner
-#include "dmlc/omp.h"                        // for omp_get_thread_num
 #include "dmlc/registry.h"                   // for DMLC_REGISTRY_FILE_TAG
 #include "driver.h"                          // for Driver
 #include "hist/evaluate_splits.h"            // for HistEvaluator, HistMultiEvaluator, UpdatePre...
 #include "hist/expand_entry.h"               // for MultiExpandEntry, CPUExpandEntry
-#include "hist/histogram.h"                  // for HistogramBuilder, ConstructHistSpace
+#include "hist/hist_cache.h"                 // for BoundedHistCollection
+#include "hist/histogram.h"                  // for MultiHistogramBuilder
+#include "hist/param.h"                      // for HistMakerTrainParam
 #include "hist/sampler.h"                    // for SampleGradient
-#include "param.h"                           // for TrainParam, SplitEntryContainer, GradStats
-#include "xgboost/base.h"                    // for GradientPairInternal, GradientPair, bst_targ...
+#include "param.h"                           // for TrainParam, GradStats
+#include "xgboost/base.h"                    // for Args, GradientPairPrecise, GradientPair, Gra...
 #include "xgboost/context.h"                 // for Context
-#include "xgboost/data.h"                    // for BatchIterator, BatchSet, DMatrix, MetaInfo
+#include "xgboost/data.h"                    // for BatchSet, DMatrix, BatchIterator, MetaInfo
 #include "xgboost/host_device_vector.h"      // for HostDeviceVector
-#include "xgboost/linalg.h"                  // for All, MatrixView, TensorView, Matrix, Empty
+#include "xgboost/json.h"                    // for Object, Json, FromJson, ToJson, get
+#include "xgboost/linalg.h"                  // for MatrixView, TensorView, All, Matrix, Empty
 #include "xgboost/logging.h"                 // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
 #include "xgboost/span.h"                    // for Span, operator!=, SpanIterator
 #include "xgboost/string_view.h"             // for operator<<
@@ -117,10 +118,11 @@ class MultiTargetHistBuilder {
 private:
  common::Monitor *monitor_{nullptr};
  TrainParam const *param_{nullptr};
+  HistMakerTrainParam const *hist_param_{nullptr};
  std::shared_ptr<common::ColumnSampler> col_sampler_;
  std::unique_ptr<HistMultiEvaluator> evaluator_;
  // Histogram builder for each target.
-  std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
+  std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
  Context const *ctx_{nullptr};
  // Partitioner for each data batch.
  std::vector<CommonRowPartitioner> partitioner_;
@@ -150,7 +152,6 @@ class MultiTargetHistBuilder {
    monitor_->Start(__func__);

    p_last_fmat_ = p_fmat;
-    std::size_t page_id = 0;
    bst_bin_t n_total_bins = 0;
    partitioner_.clear();
    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
@@ -160,16 +161,13 @@ class MultiTargetHistBuilder {
        CHECK_EQ(n_total_bins, page.cut.TotalBins());
      }
      partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->Info().IsColumnSplit());
-      page_id++;
    }

    bst_target_t n_targets = p_tree->NumTargets();
-    histogram_builder_.clear();
-    for (std::size_t i = 0; i < n_targets; ++i) {
-      histogram_builder_.emplace_back();
-      histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
-                                      collective::IsDistributed(), p_fmat->Info().IsColumnSplit());
-    }
+    histogram_builder_ = std::make_unique<MultiHistogramBuilder>();
+    histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_),
+                              collective::IsDistributed(), p_fmat->Info().IsColumnSplit(),
+                              hist_param_);

    evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
    p_last_tree_ = p_tree;
@@ -204,17 +202,7 @@ class MultiTargetHistBuilder {
    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
                          root_sum.Size() * 2);

-    std::vector<MultiExpandEntry> nodes{best};
-    std::size_t i = 0;
-    auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
-      for (bst_target_t t{0}; t < n_targets; ++t) {
-        auto t_gpair = gpair.Slice(linalg::All(), t);
-        histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
-                                        nodes, {}, t_gpair.Values());
-      }
-      i++;
-    }
+    histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));

    auto weight = evaluator_->InitRoot(root_sum);
    auto weight_t = weight.HostView();
@@ -222,9 +210,10 @@ class MultiTargetHistBuilder {
                   [&](float w) { return w * param_->learning_rate; });

    p_tree->SetLeaf(RegTree::kRoot, weight_t);
-    std::vector<common::HistCollection const *> hists;
+    std::vector<BoundedHistCollection const *> hists;
+    std::vector<MultiExpandEntry> nodes{{RegTree::kRoot, 0}};
    for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
-      hists.push_back(&histogram_builder_[t].Histogram());
+      hists.push_back(&(*histogram_builder_).Histogram(t));
    }
    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
      evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
@@ -239,50 +228,17 @@ class MultiTargetHistBuilder {
                      std::vector<MultiExpandEntry> const &valid_candidates,
                      linalg::MatrixView<GradientPair const> gpair) {
    monitor_->Start(__func__);
-    std::vector<MultiExpandEntry> nodes_to_build;
-    std::vector<MultiExpandEntry> nodes_to_sub;
-
-    for (auto const &c : valid_candidates) {
-      auto left_nidx = p_tree->LeftChild(c.nid);
-      auto right_nidx = p_tree->RightChild(c.nid);
-
-      auto build_nidx = left_nidx;
-      auto subtract_nidx = right_nidx;
-      auto lit =
-          common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
-      auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
-      auto rit =
-          common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
-      auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
-      auto fewer_right = right_sum < left_sum;
-      if (fewer_right) {
-        std::swap(build_nidx, subtract_nidx);
-      }
-      nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
-      nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
-    }
-
-    std::size_t i = 0;
-    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
-      for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
-        auto t_gpair = gpair.Slice(linalg::All(), t);
-        // Make sure the gradient matrix is f-order.
-        CHECK(t_gpair.Contiguous());
-        histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
-                                        nodes_to_build, nodes_to_sub, t_gpair.Values());
-      }
-      i++;
-    }
+    histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair,
+                                           HistBatch(param_));
    monitor_->Stop(__func__);
  }

  void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
                      std::vector<MultiExpandEntry> *best_splits) {
    monitor_->Start(__func__);
-    std::vector<common::HistCollection const *> hists;
+    std::vector<BoundedHistCollection const *> hists;
    for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
-      hists.push_back(&histogram_builder_[t].Histogram());
+      hists.push_back(&(*histogram_builder_).Histogram(t));
    }
    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
      evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
@@ -306,10 +262,12 @@ class MultiTargetHistBuilder {

 public:
  explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
+                                  HistMakerTrainParam const *hist_param,
                                  std::shared_ptr<common::ColumnSampler> column_sampler,
                                  ObjInfo const *task, common::Monitor *monitor)
      : monitor_{monitor},
        param_{param},
+        hist_param_{hist_param},
        col_sampler_{std::move(column_sampler)},
        evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
        ctx_{ctx},
@@ -331,10 +289,14 @@ class MultiTargetHistBuilder {
  }
 };

-class HistBuilder {
+/**
+ * @brief Tree updater for single-target trees.
+ */
+class HistUpdater {
 private:
  common::Monitor *monitor_;
  TrainParam const *param_;
+  HistMakerTrainParam const *hist_param_{nullptr};
  std::shared_ptr<common::ColumnSampler> col_sampler_;
  std::unique_ptr<HistEvaluator> evaluator_;
  std::vector<CommonRowPartitioner> partitioner_;
@@ -343,22 +305,22 @@ class HistBuilder {
  const RegTree *p_last_tree_{nullptr};
  DMatrix const *const p_last_fmat_{nullptr};

-  std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
+  std::unique_ptr<MultiHistogramBuilder> histogram_builder_;
  ObjInfo const *task_{nullptr};
  // Context for number of threads
  Context const *ctx_{nullptr};

 public:
-  explicit HistBuilder(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
-                       TrainParam const *param, DMatrix const *fmat, ObjInfo const *task,
-                       common::Monitor *monitor)
+  explicit HistUpdater(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
+                       TrainParam const *param, HistMakerTrainParam const *hist_param,
+                       DMatrix const *fmat, ObjInfo const *task, common::Monitor *monitor)
      : monitor_{monitor},
        param_{param},
+        hist_param_{hist_param},
        col_sampler_{std::move(column_sampler)},
-        evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(),
-                                                                   col_sampler_)},
+        evaluator_{std::make_unique<HistEvaluator>(ctx, param, fmat->Info(), col_sampler_)},
        p_last_fmat_(fmat),
-        histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
+        histogram_builder_{new MultiHistogramBuilder},
        task_{task},
        ctx_{ctx} {
    monitor_->Init(__func__);
@@ -381,7 +343,6 @@ class HistBuilder {
  // initialize temp data structure
  void InitData(DMatrix *fmat, RegTree const *p_tree) {
    monitor_->Start(__func__);
-    std::size_t page_id{0};
    bst_bin_t n_total_bins{0};
    partitioner_.clear();
    for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
@@ -392,10 +353,9 @@ class HistBuilder {
      }
      partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid,
                                fmat->Info().IsColumnSplit());
-      ++page_id;
    }
-    histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
-                              collective::IsDistributed(), fmat->Info().IsColumnSplit());
+    histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(),
+                              fmat->Info().IsColumnSplit(), hist_param_);
    evaluator_ = std::make_unique<HistEvaluator>(ctx_, this->param_, fmat->Info(), col_sampler_);
    p_last_tree_ = p_tree;
    monitor_->Stop(__func__);
@@ -404,7 +364,7 @@ class HistBuilder {
  void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
                      std::vector<CPUExpandEntry> *best_splits) {
    monitor_->Start(__func__);
-    auto const &histograms = histogram_builder_->Histogram();
+    auto const &histograms = histogram_builder_->Histogram(0);
    auto ft = p_fmat->Info().feature_types.ConstHostSpan();
    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
      evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
@@ -422,16 +382,8 @@ class HistBuilder {
    monitor_->Start(__func__);
    CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));

-    std::size_t page_id = 0;
-    auto space = ConstructHistSpace(partitioner_, {node});
-    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
-      std::vector<CPUExpandEntry> nodes_to_build{node};
-      std::vector<CPUExpandEntry> nodes_to_sub;
-      this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
-                                          partitioner_.at(page_id).Partitions(), nodes_to_build,
-                                          nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
-      ++page_id;
-    }
+    this->histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, node,
+                                            HistBatch(param_));

    {
      GradientPairPrecise grad_stat;
@@ -445,7 +397,7 @@ class HistBuilder {
        CHECK_GE(row_ptr.size(), 2);
        std::uint32_t const ibegin = row_ptr[0];
        std::uint32_t const iend = row_ptr[1];
-        auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
+        auto hist = this->histogram_builder_->Histogram(0)[RegTree::kRoot];
        auto begin = hist.data();
        for (std::uint32_t i = ibegin; i < iend; ++i) {
          GradientPairPrecise const &et = begin[i];
@@ -468,7 +420,7 @@ class HistBuilder {
      monitor_->Start("EvaluateSplits");
      auto ft = p_fmat->Info().feature_types.ConstHostSpan();
      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
-        evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
+        evaluator_->EvaluateSplits(histogram_builder_->Histogram(0), gmat.cut, ft, *p_tree,
                                   &entries);
        break;
      }
@@ -484,33 +436,8 @@ class HistBuilder {
                      std::vector<CPUExpandEntry> const &valid_candidates,
                      linalg::MatrixView<GradientPair const> gpair) {
    monitor_->Start(__func__);
-    std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
-    std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
-
-    std::size_t n_idx = 0;
-    for (auto const &c : valid_candidates) {
-      auto left_nidx = (*p_tree)[c.nid].LeftChild();
-      auto right_nidx = (*p_tree)[c.nid].RightChild();
-      auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
-
-      auto build_nidx = left_nidx;
-      auto subtract_nidx = right_nidx;
-      if (fewer_right) {
-        std::swap(build_nidx, subtract_nidx);
-      }
-      nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
-      nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
-      n_idx++;
-    }
-
-    std::size_t page_id{0};
-    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
-      histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
-                                    partitioner_.at(page_id).Partitions(), nodes_to_build,
-                                    nodes_to_sub, gpair.Values());
-      ++page_id;
-    }
+    this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates,
+                                                 gpair, HistBatch(param_));
    monitor_->Stop(__func__);
  }

@@ -529,7 +456,7 @@ class HistBuilder {
                     std::vector<bst_node_t> *p_out_position) {
    monitor_->Start(__func__);
    if (!task_->UpdateTreeLeaf()) {
-    monitor_->Stop(__func__);
+      monitor_->Stop(__func__);
      return;
    }
    for (auto const &part : partitioner_) {
@@ -541,42 +468,50 @@ class HistBuilder {

 /*! \brief construct a tree using quantized feature values */
 class QuantileHistMaker : public TreeUpdater {
-  std::unique_ptr<HistBuilder> p_impl_{nullptr};
+  std::unique_ptr<HistUpdater> p_impl_{nullptr};
  std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
  std::shared_ptr<common::ColumnSampler> column_sampler_ =
      std::make_shared<common::ColumnSampler>();
  common::Monitor monitor_;
  ObjInfo const *task_{nullptr};
+  HistMakerTrainParam hist_param_;

 public:
  explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
      : TreeUpdater{ctx}, task_{task} {}
-  void Configure(const Args &) override {}

-  void LoadConfig(Json const &) override {}
-  void SaveConfig(Json *) const override {}
+  void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); }
+  void LoadConfig(Json const &in) override {
+    auto const &config = get<Object const>(in);
+    FromJson(config.at("hist_train_param"), &hist_param_);
+  }
+  void SaveConfig(Json *p_out) const override {
+    auto &out = *p_out;
+    out["hist_train_param"] = ToJson(hist_param_);
+  }

  [[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; }

-  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
+  void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
              common::Span<HostDeviceVector<bst_node_t>> out_position,
              const std::vector<RegTree *> &trees) override {
    if (trees.front()->IsMultiTarget()) {
+      CHECK(hist_param_.GetInitialised());
      CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
      if (!p_mtimpl_) {
        this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
-            ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
+            ctx_, p_fmat->Info(), param, &hist_param_, column_sampler_, task_, &monitor_);
      }
    } else {
+      CHECK(hist_param_.GetInitialised());
      if (!p_impl_) {
-        p_impl_ =
-            std::make_unique<HistBuilder>(ctx_, column_sampler_, param, p_fmat, task_, &monitor_);
+        p_impl_ = std::make_unique<HistUpdater>(ctx_, column_sampler_, param, &hist_param_, p_fmat,
+                                                task_, &monitor_);
      }
    }

    bst_target_t n_targets = trees.front()->NumTargets();
-    auto h_gpair =
-        linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets);
+    auto h_gpair = gpair->HostView();

    linalg::Matrix<GradientPair> sample_out;
    auto h_sample_out = h_gpair;
@@ -601,6 +536,8 @@ class QuantileHistMaker : public TreeUpdater {
        UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
                                   h_out_position, *tree_it);
      }
+
+      hist_param_.CheckTreesSynchronized(*tree_it);
    }
  }

--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -31,11 +31,14 @@ class TreeRefresher : public TreeUpdater {
  [[nodiscard]] char const *Name() const override { return "refresh"; }
  [[nodiscard]] bool CanModifyTree() const override { return true; }
  // update the tree, do pruning
-  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
+  void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
              common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
              const std::vector<RegTree *> &trees) override {
-    if (trees.size() == 0) return;
-    const std::vector<GradientPair> &gpair_h = gpair->ConstHostVector();
+    if (trees.size() == 0) {
+      return;
+    }
+    CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented();
+    const std::vector<GradientPair> &gpair_h = gpair->Data()->ConstHostVector();
    // thread temporal space
    std::vector<std::vector<GradStats> > stemp;
    std::vector<RegTree::FVec> fvec_temp;
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@@ -31,7 +31,7 @@ class TreeSyncher : public TreeUpdater {

  [[nodiscard]] char const* Name() const override { return "prune"; }

-  void Update(TrainParam const*, HostDeviceVector<GradientPair>*, DMatrix*,
+  void Update(TrainParam const*, linalg::Matrix<GradientPair>*, DMatrix*,
              common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
              const std::vector<RegTree*>& trees) override {
    if (collective::GetWorldSize() == 1) return;