[EM] Make page concatenation optional. (#10826)

This PR introduces a new parameter `extmem_concat_pages` to make page concatenation optional for the GPU hist tree method. In addition, the documentation is updated for the new GPU-based external memory support.
Author: Jiaming Yuan
Date: 2024-09-24 06:19:28 +08:00
Committed by: GitHub
Parent: 215da76263
Commit: e228c1a121
31 changed files with 690 additions and 388 deletions
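For context, the new flag is a regular booster parameter, so it can be toggled through the existing C API. A minimal sketch (booster creation and data loading assumed to be in place; error handling omitted):

#include <xgboost/c_api.h>

// Hedged sketch: enable page concatenation for GPU hist on an existing booster.
void EnableConcat(BoosterHandle booster) {
  XGBoosterSetParam(booster, "device", "cuda");
  XGBoosterSetParam(booster, "tree_method", "hist");
  XGBoosterSetParam(booster, "subsample", "0.5");  // concatenation requires sampling
  XGBoosterSetParam(booster, "extmem_concat_pages", "true");
}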


@@ -106,9 +106,10 @@ inline auto NoCategorical(std::string name) {
return name + " doesn't support categorical features.";
}
inline void NoOnHost(bool on_host) {
if (on_host) {
LOG(FATAL) << "Caching on host memory is only available for GPU.";
inline void NoPageConcat(bool concat_pages) {
if (concat_pages) {
LOG(FATAL) << "`extmem_concat_pages` must be false when there's no sampling or when it's "
"running on the CPU.";
}
}
} // namespace xgboost::error
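Call sites simply pass the user-supplied flag through to this guard; the CPU updaters later in this commit use the pattern below.

// Reject `extmem_concat_pages` on code paths that cannot honor it.
error::NoPageConcat(this->hist_param_.extmem_concat_pages);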

src/data/batch_utils.cuh (new file, 24 lines)

@@ -0,0 +1,24 @@
/**
* Copyright 2024, XGBoost Contributors
*/
#pragma once
#include "xgboost/data.h" // for BatchParam
namespace xgboost::data::cuda_impl {
// Use two batches for prefetching. There's always one batch being worked on, while the
// other batch is being transferred.
constexpr auto DftPrefetchBatches() { return 2; }
// Empty parameter to prevent regen, only used to control external memory prefetching.
//
// Both the approx and hist tree methods initialize the DMatrix before creating the actual
// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty
// parameter to avoid any regen.
inline BatchParam StaticBatch(bool prefetch_copy) {
BatchParam p;
p.prefetch_copy = prefetch_copy;
p.n_prefetch_batches = DftPrefetchBatches();
return p;
}
} // namespace xgboost::data::cuda_impl
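As a usage sketch (mirroring the predictor changes below), a caller that only wants to iterate over already-built ELLPACK pages passes StaticBatch so that no quantile regeneration is triggered:

// Assumed context: a Context* ctx and a DMatrix* p_fmat with cached ELLPACK pages.
for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx, data::cuda_impl::StaticBatch(true))) {
  // One batch is processed here while the next one is prefetched.
}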


@@ -920,7 +920,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
data::fileiter::Next,
std::numeric_limits<float>::quiet_NaN(),
1,
cache_file};
cache_file,
false};
}
return dmat;


@@ -10,7 +10,7 @@
#include <utility> // for move
#include <vector> // for vector
#include "../common/cuda_rt_utils.h" // for SupportsPageableMem
#include "../common/cuda_rt_utils.h" // for SupportsPageableMem, SupportsAts
#include "../common/hist_util.h" // for HistogramCuts
#include "ellpack_page.h" // for EllpackPage
#include "ellpack_page_raw_format.h" // for EllpackPageRawFormat
@@ -67,7 +67,20 @@ class EllpackFormatPolicy {
using FormatT = EllpackPageRawFormat;
public:
EllpackFormatPolicy() = default;
EllpackFormatPolicy() {
StringView msg{" The overhead of iterating through external memory might be significant."};
if (!has_hmm_) {
LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
} else if (!common::SupportsAts()) {
LOG(WARNING) << "CUDA address translation service is not available." << msg;
}
#if !defined(XGBOOST_USE_RMM)
LOG(WARNING) << "XGBoost is not built with RMM support." << msg;
#endif
if (!GlobalConfigThreadLocalStore::Get()->use_rmm) {
LOG(WARNING) << "`use_rmm` is set to false." << msg;
}
}
// For testing with the HMM flag.
explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
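The last warning can be addressed from user code by flipping the global `use_rmm` flag; a hedged sketch using the public C API (assuming XGBoost was built with RMM support):

#include <xgboost/c_api.h>

// Enable RMM-based allocation in the global config so the `use_rmm` warning is not raised.
XGBSetGlobalConfig("{\"use_rmm\": true}");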
@@ -135,6 +148,9 @@ class EllpackMmapStreamPolicy : public F<S> {
bst_idx_t length) const;
};
/**
* @brief Ellpack source with sparse pages as the underlying source.
*/
template <typename F>
class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
using Super = PageSourceIncMixIn<EllpackPage, F>;
@@ -171,6 +187,9 @@ using EllpackPageHostSource =
using EllpackPageSource =
EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
/**
* @brief Ellpack source directly interfaces with user-defined iterators.
*/
template <typename FormatCreatePolicy>
class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, FormatCreatePolicy> {
using Super = ExtQantileSourceMixin<EllpackPage, FormatCreatePolicy>;
@@ -201,6 +220,7 @@ class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, Forma
info_{info},
ext_info_{std::move(ext_info)},
base_rows_{std::move(base_rows)} {
cuts->SetDevice(ctx->Device());
this->SetCuts(std::move(cuts), ctx->Device());
this->Fetch();
}


@@ -13,6 +13,7 @@
#include "proxy_dmatrix.h" // for DataIterProxy, HostAdapterDispatch
#include "quantile_dmatrix.h" // for GetDataShape, MakeSketches
#include "simple_batch_iterator.h" // for SimpleBatchIteratorImpl
#include "sparse_page_source.h" // for MakeCachePrefix
#if !defined(XGBOOST_USE_CUDA)
#include "../common/common.h" // for AssertGPUSupport
@@ -26,6 +27,7 @@ ExtMemQuantileDMatrix::ExtMemQuantileDMatrix(DataIterHandle iter_handle, DMatrix
std::int32_t n_threads, std::string cache,
bst_bin_t max_bin, bool on_host)
: cache_prefix_{std::move(cache)}, on_host_{on_host} {
cache_prefix_ = MakeCachePrefix(cache_prefix_);
auto iter = std::make_shared<DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>>(
iter_handle, reset, next);
iter->Reset();


@@ -13,9 +13,9 @@
#include <utility> // for move
#include <variant> // for visit
#include "../collective/communicator-inl.h"
#include "batch_utils.h" // for RegenGHist
#include "gradient_index.h"
#include "batch_utils.h" // for RegenGHist
#include "gradient_index.h" // for GHistIndexMatrix
#include "sparse_page_source.h" // for MakeCachePrefix
namespace xgboost::data {
MetaInfo &SparsePageDMatrix::Info() { return info_; }
@@ -34,12 +34,9 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
cache_prefix_{std::move(cache_prefix)},
on_host_{on_host} {
Context ctx;
ctx.nthread = nthreads;
ctx.Init(Args{{"nthread", std::to_string(nthreads)}});
cache_prefix_ = MakeCachePrefix(cache_prefix_);
cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
if (collective::IsDistributed()) {
cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
}
DMatrixProxy *proxy = MakeProxy(proxy_);
auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
iter_, reset_, next_};
@@ -107,7 +104,6 @@ BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
auto id = MakeCache(this, ".col.page", on_host_, cache_prefix_, &cache_info_);
CHECK_NE(this->Info().num_col_, 0);
error::NoOnHost(on_host_);
this->InitializeSparsePage(ctx);
if (!column_source_) {
column_source_ =
@@ -122,7 +118,6 @@ BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) {
auto id = MakeCache(this, ".sorted.col.page", on_host_, cache_prefix_, &cache_info_);
CHECK_NE(this->Info().num_col_, 0);
error::NoOnHost(on_host_);
this->InitializeSparsePage(ctx);
if (!sorted_column_source_) {
sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
@@ -140,7 +135,6 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ct
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
error::NoOnHost(on_host_);
auto id = MakeCache(this, ".gradient_index.page", on_host_, cache_prefix_, &cache_info_);
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
this->InitializeSparsePage(ctx);


@@ -70,10 +70,10 @@ class SparsePageDMatrix : public DMatrix {
DataIterResetCallback *reset_;
XGDMatrixCallbackNext *next_;
float missing_;
float const missing_;
Context fmat_ctx_;
std::string cache_prefix_;
bool on_host_{false};
bool const on_host_;
std::uint32_t n_batches_{0};
// The sparse page is the source for the other page types, so it gets a dedicated member function.
void InitializeSparsePage(Context const *ctx);
@@ -83,7 +83,7 @@ class SparsePageDMatrix : public DMatrix {
public:
explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int32_t nthreads,
std::string cache_prefix, bool on_host = false);
std::string cache_prefix, bool on_host);
~SparsePageDMatrix() override;


@@ -54,22 +54,18 @@ class SparsePageRawFormat : public SparsePageFormat<T> {
private:
};
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
.describe("Raw binary data format.")
.set_body([]() {
return new SparsePageRawFormat<SparsePage>();
});
#define SparsePageFmt SparsePageFormat<SparsePage>
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SparsePage>, SparsePageFmt, raw)
.describe("Raw binary data format.")
.set_body([]() { return new SparsePageRawFormat<SparsePage>(); });
XGBOOST_REGISTER_CSC_PAGE_FORMAT(raw)
.describe("Raw binary data format.")
.set_body([]() {
return new SparsePageRawFormat<CSCPage>();
});
XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
.describe("Raw binary data format.")
.set_body([]() {
return new SparsePageRawFormat<SortedCSCPage>();
});
#define CSCPageFmt SparsePageFormat<CSCPage>
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<CSCPage>, CSCPageFmt, raw)
.describe("Raw binary data format.")
.set_body([]() { return new SparsePageRawFormat<CSCPage>(); });
#define SortedCSCPageFmt SparsePageFormat<SortedCSCPage>
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SortedCSCPage>, SortedCSCPageFmt, raw)
.describe("Raw binary data format.")
.set_body([]() { return new SparsePageRawFormat<SortedCSCPage>(); });
} // namespace xgboost::data
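These registrations are consumed through CreatePageFormat (shown in the writer header below), which looks the format name up in the DMLC registry; a usage sketch:

// Assumed context: inside xgboost::data. Instantiate the "raw" format for SparsePage.
std::unique_ptr<SparsePageFormat<SparsePage>> fmt{CreatePageFormat<SparsePage>("raw")};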


@@ -8,6 +8,8 @@
#include <numeric> // for partial_sum
#include <string> // for string
#include "../collective/communicator-inl.h" // for IsDistributed, GetRank
namespace xgboost::data {
void Cache::Commit() {
if (!this->written) {
@@ -28,6 +30,14 @@ void TryDeleteCacheFile(const std::string& file) {
}
}
std::string MakeCachePrefix(std::string cache_prefix) {
cache_prefix = cache_prefix.empty() ? "DMatrix" : cache_prefix;
if (collective::IsDistributed()) {
cache_prefix += ("-r" + std::to_string(collective::GetRank()));
}
return cache_prefix;
}
#if !defined(XGBOOST_USE_CUDA)
void InitNewThread::operator()() const { *GlobalConfigThreadLocalStore::Get() = config; }
#endif
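For illustration, the extracted helper behaves as follows (rank values are examples):

// MakeCachePrefix("")       -> "DMatrix"      (single process)
// MakeCachePrefix("cache")  -> "cache"        (single process)
// MakeCachePrefix("")       -> "DMatrix-r0"   (distributed, rank 0)
// MakeCachePrefix("cache")  -> "cache-r1"     (distributed, rank 1)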


@@ -33,6 +33,8 @@
namespace xgboost::data {
void TryDeleteCacheFile(const std::string& file);
std::string MakeCachePrefix(std::string cache_prefix);
/**
* @brief Information about the cache including path and page offsets.
*/


@@ -1,5 +1,5 @@
/**
* Copyright 2014-2023, XGBoost Contributors
* Copyright 2014-2024, XGBoost Contributors
* \file sparse_page_writer.h
* \author Tianqi Chen
*/
@@ -11,7 +11,6 @@
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase
#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
namespace xgboost::data {
template<typename T>
@@ -54,47 +53,13 @@ inline SparsePageFormat<T>* CreatePageFormat(const std::string& name) {
return (e->body)();
}
/*!
* \brief Registry entry for sparse page format.
/**
* @brief Registry entry for sparse page format.
*/
template<typename T>
struct SparsePageFormatReg
: public dmlc::FunctionRegEntryBase<SparsePageFormatReg<T>,
std::function<SparsePageFormat<T>* ()>> {
};
/*!
* \brief Macro to register sparse page format.
*
* \code
* // example of registering a objective
* XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
* .describe("Raw binary data format.")
* .set_body([]() {
* return new RawFormat();
* });
* \endcode
*/
#define SparsePageFmt SparsePageFormat<SparsePage>
#define XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SparsePage>, SparsePageFmt, Name)
#define CSCPageFmt SparsePageFormat<CSCPage>
#define XGBOOST_REGISTER_CSC_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<CSCPage>, CSCPageFmt, Name)
#define SortedCSCPageFmt SparsePageFormat<SortedCSCPage>
#define XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<SortedCSCPage>, SortedCSCPageFmt, Name)
#define EllpackPageFmt SparsePageFormat<EllpackPage>
#define XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<EllpackPage>, EllpackPageFmt, Name)
#define GHistIndexPageFmt SparsePageFormat<GHistIndexMatrix>
#define XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(Name) \
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
GHistIndexPageFmt, Name)
} // namespace xgboost::data
#endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_


@@ -14,9 +14,10 @@
#include "../common/categorical.h"
#include "../common/common.h"
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs
#include "../common/cuda_rt_utils.h" // for AllVisibleGPUs, SetDevice
#include "../common/device_helpers.cuh"
#include "../common/error_msg.h" // for InplacePredictProxy
#include "../common/error_msg.h" // for InplacePredictProxy
#include "../data/batch_utils.cuh" // for StaticBatch
#include "../data/device_adapter.cuh"
#include "../data/ellpack_page.cuh"
#include "../data/proxy_dmatrix.h"
@@ -31,6 +32,8 @@
namespace xgboost::predictor {
DMLC_REGISTRY_FILE_TAG(gpu_predictor);
using data::cuda_impl::StaticBatch;
struct TreeView {
RegTree::CategoricalSplitMatrix cats;
common::Span<RegTree::Node const> d_tree;
@@ -475,15 +478,14 @@ struct PathInfo {
};
// Transform model into path element form for GPUTreeShap
void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
void ExtractPaths(Context const* ctx,
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
DeviceOrd device) {
common::SetDevice(device.ordinal);
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
dh::XGBCachingDeviceAllocator<PathInfo> alloc;
auto d_nodes = device_model.nodes.ConstDeviceSpan();
auto d_tree_segments = device_model.tree_segments.ConstDeviceSpan();
auto nodes_transform = dh::MakeTransformIterator<PathInfo>(
@@ -502,17 +504,15 @@ void ExtractPaths(
}
return PathInfo{static_cast<int64_t>(idx), path_length, tree_idx};
});
auto end = thrust::copy_if(
thrust::cuda::par(alloc), nodes_transform,
nodes_transform + d_nodes.size(), info.begin(),
[=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
auto end = thrust::copy_if(ctx->CUDACtx()->CTP(), nodes_transform,
nodes_transform + d_nodes.size(), info.begin(),
[=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
info.resize(end - info.begin());
auto length_iterator = dh::MakeTransformIterator<size_t>(
info.begin(),
[=] __device__(const PathInfo& info) { return info.length; });
dh::caching_device_vector<size_t> path_segments(info.size() + 1);
thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator,
length_iterator + info.size() + 1,
thrust::exclusive_scan(ctx->CUDACtx()->CTP(), length_iterator, length_iterator + info.size() + 1,
path_segments.begin());
paths->resize(path_segments.back());
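These changes all follow the same pattern: Thrust algorithms receive the stream-bound execution policy from CTP(), and kernels receive the stream explicitly, instead of running on the legacy default stream. A standalone sketch of the policy-on-stream idiom (plain CUDA/Thrust, not XGBoost internals):

#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/system/cuda/execution_policy.h>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  thrust::device_vector<int> lengths(4, 2);   // e.g. per-path lengths
  thrust::device_vector<int> segments(5, 0);  // exclusive-scan output

  // The algorithm is ordered on `stream`, analogous to ctx->CUDACtx()->CTP() above.
  thrust::exclusive_scan(thrust::cuda::par.on(stream), lengths.begin(), lengths.end(),
                         segments.begin());

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}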
@@ -528,19 +528,17 @@ void ExtractPaths(
auto d_cat_node_segments = device_model.categories_node_segments.ConstDeviceSpan();
size_t max_cat = 0;
if (thrust::any_of(dh::tbegin(d_split_types), dh::tend(d_split_types),
if (thrust::any_of(ctx->CUDACtx()->CTP(), dh::tbegin(d_split_types), dh::tend(d_split_types),
common::IsCatOp{})) {
dh::PinnedMemory pinned;
auto h_max_cat = pinned.GetSpan<RegTree::CategoricalSplitMatrix::Segment>(1);
auto max_elem_it = dh::MakeTransformIterator<size_t>(
dh::tbegin(d_cat_node_segments),
[] __device__(RegTree::CategoricalSplitMatrix::Segment seg) { return seg.size; });
size_t max_cat_it =
thrust::max_element(thrust::device, max_elem_it,
max_elem_it + d_cat_node_segments.size()) -
max_elem_it;
dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
size_t max_cat_it = thrust::max_element(ctx->CUDACtx()->CTP(), max_elem_it,
max_elem_it + d_cat_node_segments.size()) -
max_elem_it;
dh::safe_cuda(cudaMemcpy(h_max_cat.data(), d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
max_cat = h_max_cat[0].size;
CHECK_GE(max_cat, 1);
@@ -550,7 +548,7 @@ void ExtractPaths(
auto d_model_categories = device_model.categories.DeviceSpan();
common::Span<uint32_t> d_path_categories = dh::ToSpan(*path_categories);
dh::LaunchN(info.size(), [=] __device__(size_t idx) {
dh::LaunchN(info.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t idx) {
auto path_info = d_info[idx];
size_t tree_offset = d_tree_segments[path_info.tree_idx];
TreeView tree{0, path_info.tree_idx, d_nodes,
@@ -864,7 +862,7 @@ class GPUPredictor : public xgboost::Predictor {
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
num_features);
auto const kernel = [&](auto predict_fn) {
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}(
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
predict_fn, data, model.nodes.ConstDeviceSpan(),
predictions->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
@@ -888,7 +886,7 @@ class GPUPredictor : public xgboost::Predictor {
DeviceModel d_model;
bool use_shared = false;
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS}(
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, 0, ctx_->CUDACtx()->Stream()}(
PredictKernel<EllpackLoader, EllpackDeviceAccessor>, batch, model.nodes.ConstDeviceSpan(),
out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
@@ -924,7 +922,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
bst_idx_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model,
@@ -989,7 +987,7 @@ class GPUPredictor : public xgboost::Predictor {
bool use_shared = shared_memory_bytes != 0;
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes}(
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
PredictKernel<Loader, typename Loader::BatchT>, m->Value(), d_model.nodes.ConstDeviceSpan(),
out_preds->predictions.DeviceSpan(), d_model.tree_segments.ConstDeviceSpan(),
d_model.tree_group.ConstDeviceSpan(), d_model.split_types.ConstDeviceSpan(),
@@ -1055,7 +1053,7 @@ class GPUPredictor : public xgboost::Predictor {
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device());
if (p_fmat->PageExists<SparsePage>()) {
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->Device());
@@ -1067,7 +1065,7 @@ class GPUPredictor : public xgboost::Predictor {
X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
}
} else {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)};
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
@@ -1083,7 +1081,7 @@ class GPUPredictor : public xgboost::Predictor {
auto base_score = model.learner_model_param->BaseScore(ctx_);
dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group,
[=] __device__(size_t idx) {
ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) {
phis[(idx + 1) * contributions_columns - 1] +=
margin.empty() ? base_score(0) : margin[idx];
});
@@ -1125,7 +1123,7 @@ class GPUPredictor : public xgboost::Predictor {
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
ExtractPaths(ctx_, &device_paths, &d_model, &categories, ctx_->Device());
if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->Device());
@@ -1137,7 +1135,7 @@ class GPUPredictor : public xgboost::Predictor {
X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
}
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
auto impl = batch.Impl();
auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan());
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
@@ -1155,7 +1153,7 @@ class GPUPredictor : public xgboost::Predictor {
auto base_score = model.learner_model_param->BaseScore(ctx_);
size_t n_features = model.learner_model_param->num_feature;
dh::LaunchN(p_fmat->Info().num_row_ * model.learner_model_param->num_output_group,
[=] __device__(size_t idx) {
ctx_->CUDACtx()->Stream(), [=] __device__(size_t idx) {
size_t group = idx % ngroup;
size_t row_idx = idx / ngroup;
phis[gpu_treeshap::IndexPhiInteractions(row_idx, ngroup, group, n_features,
@@ -1199,7 +1197,7 @@ class GPUPredictor : public xgboost::Predictor {
bst_feature_t num_features = info.num_col_;
auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) {
dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes}(
dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()}(
fn, data, d_model.nodes.ConstDeviceSpan(),
predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(),
@@ -1223,7 +1221,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
bst_idx_t batch_offset = 0;
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)};
auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);


@@ -148,19 +148,8 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_;
};
NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
DMatrix* dmat) {
return {dmat, gpair};
}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
: batch_param_{std::move(batch_param)} {}
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const*,
common::Span<GradientPair> gpair,
DMatrix* p_fmat) {
DMatrix* p_fmat) {
return {p_fmat, gpair};
}
@@ -246,9 +235,10 @@ GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batc
grad_sum_(n_rows, 0.0f) {}
GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair, DMatrix* dmat) {
common::Span<GradientPair> gpair,
DMatrix* p_fmat) {
auto cuctx = ctx->CUDACtx();
size_t n_rows = dmat->Info().num_row_;
size_t n_rows = p_fmat->Info().num_row_;
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
ctx, gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
@@ -257,7 +247,7 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
return {dmat, gpair};
return {p_fmat, gpair};
}
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
@@ -323,46 +313,46 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
const BatchParam& batch_param, float subsample,
int sampling_method, bool is_external_memory) {
int sampling_method, bool concat_pages) {
// The ctx is kept here for future development of stream-based operations.
monitor_.Init("gradient_based_sampler");
monitor_.Init(__func__);
bool is_sampling = subsample < 1.0;
if (is_sampling) {
switch (sampling_method) {
case TrainParam::kUniform:
if (is_external_memory) {
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new UniformSampling(batch_param, subsample));
}
break;
case TrainParam::kGradientBased:
if (is_external_memory) {
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
}
break;
default:
LOG(FATAL) << "unknown sampling method";
if (!is_sampling) {
strategy_.reset(new NoSampling{});
error::NoPageConcat(concat_pages);
return;
}
switch (sampling_method) {
case TrainParam::kUniform: {
if (concat_pages) {
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new UniformSampling(batch_param, subsample));
}
break;
}
} else {
if (is_external_memory) {
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
} else {
strategy_.reset(new NoSampling(batch_param));
case TrainParam::kGradientBased: {
if (concat_pages) {
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
}
break;
}
default:
LOG(FATAL) << "unknown sampling method";
}
}
// Sample a DMatrix based on the given gradient pairs.
GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
common::Span<GradientPair> gpair, DMatrix* dmat) {
monitor_.Start("Sample");
monitor_.Start(__func__);
GradientBasedSample sample = strategy_->Sample(ctx, gpair, dmat);
monitor_.Stop("Sample");
monitor_.Stop(__func__);
return sample;
}
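The net effect of the refactor is the following strategy selection; no sampling now always takes the in-memory path, and page concatenation is an explicit opt-in:

// subsample == 1.0                 -> NoSampling (NoPageConcat enforces concat_pages == false)
// kUniform       &&  concat_pages  -> ExternalMemoryUniformSampling
// kUniform       && !concat_pages  -> UniformSampling
// kGradientBased &&  concat_pages  -> ExternalMemoryGradientBasedSampling
// kGradientBased && !concat_pages  -> GradientBasedSampling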


@@ -24,31 +24,29 @@ class SamplingStrategy {
virtual GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) = 0;
virtual ~SamplingStrategy() = default;
/**
* @brief Whether pages are concatenated after sampling.
*/
[[nodiscard]] virtual bool ConcatPages() const { return false; }
};
/*! \brief No sampling in in-memory mode. */
class ExtMemSamplingStrategy : public SamplingStrategy {
public:
[[nodiscard]] bool ConcatPages() const final { return true; }
};
/**
* @brief No-op.
*/
class NoSampling : public SamplingStrategy {
public:
explicit NoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
explicit ExternalMemoryNoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
};
/*! \brief Uniform sampling in in-memory mode. */
/**
* @brief Uniform sampling in in-memory mode.
*/
class UniformSampling : public SamplingStrategy {
public:
UniformSampling(BatchParam batch_param, float subsample);
@@ -61,7 +59,7 @@ class UniformSampling : public SamplingStrategy {
};
/*! \brief Uniform sampling in external memory mode. */
class ExternalMemoryUniformSampling : public SamplingStrategy {
class ExternalMemoryUniformSampling : public ExtMemSamplingStrategy {
public:
ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
@@ -91,7 +89,7 @@ class GradientBasedSampling : public SamplingStrategy {
};
/*! \brief Gradient-based sampling in external memory mode. */
class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
class ExternalMemoryGradientBasedSampling : public ExtMemSamplingStrategy {
public:
ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
@@ -120,7 +118,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
class GradientBasedSampler {
public:
GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
float subsample, int sampling_method, bool is_external_memory);
float subsample, int sampling_method, bool concat_pages);
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
@@ -130,6 +128,8 @@ class GradientBasedSampler {
common::Span<float> threshold, common::Span<float> grad_sum,
size_t sample_rows);
[[nodiscard]] bool ConcatPages() const { return this->strategy_->ConcatPages(); }
private:
common::Monitor monitor_;
std::unique_ptr<SamplingStrategy> strategy_;


@@ -23,6 +23,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
constexpr static std::size_t CudaDefaultNodes() { return static_cast<std::size_t>(1) << 12; }
bool debug_synchronize{false};
bool extmem_concat_pages{false};
void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const;
@@ -42,6 +43,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
.set_default(NotSet())
.set_lower_bound(1)
.describe("Maximum number of nodes in histogram cache.");
DMLC_DECLARE_FIELD(extmem_concat_pages).set_default(false);
}
};
} // namespace xgboost::tree
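Since this is a DMLC parameter, the updaters pick it up through the usual Args-based configuration path; a hedged sketch of that flow:

// Sketch: the flag arrives with the other updater arguments during Configure().
HistMakerTrainParam hist_param;
hist_param.UpdateAllowUnknown(Args{{"extmem_concat_pages", "true"}});
CHECK(hist_param.extmem_concat_pages);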


@@ -278,7 +278,7 @@ class GlobalApproxUpdater : public TreeUpdater {
*sampled = linalg::Empty<GradientPair>(ctx_, gpair->Size(), 1);
auto in = gpair->HostView().Values();
std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data());
error::NoPageConcat(this->hist_param_.extmem_concat_pages);
SampleGradient(ctx_, param, sampled->HostView());
}


@@ -5,10 +5,11 @@
#include <limits> // for numeric_limits
#include <ostream> // for ostream
#include "gpu_hist/quantiser.cuh" // for GradientQuantiser
#include "param.h" // for TrainParam
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/task.h" // for ObjInfo
#include "../data/batch_utils.cuh" // for DftPrefetchBatches, StaticBatch
#include "gpu_hist/quantiser.cuh" // for GradientQuantiser
#include "param.h" // for TrainParam
#include "xgboost/base.h" // for bst_bin_t
#include "xgboost/task.h" // for ObjInfo
namespace xgboost::tree {
struct GPUTrainingParam {
@@ -119,26 +120,19 @@ struct DeviceSplitCandidate {
};
namespace cuda_impl {
constexpr auto DftPrefetchBatches() { return 2; }
inline BatchParam HistBatch(TrainParam const& param) {
auto p = BatchParam{param.max_bin, TrainParam::DftSparseThreshold()};
p.prefetch_copy = true;
p.n_prefetch_batches = DftPrefetchBatches();
p.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches();
return p;
}
inline BatchParam ApproxBatch(TrainParam const& p, common::Span<float const> hess,
ObjInfo const& task) {
return BatchParam{p.max_bin, hess, !task.const_hess};
}
// Empty parameter to prevent regen, only used to control external memory prefetching.
inline BatchParam StaticBatch(bool prefetch_copy) {
BatchParam p;
p.prefetch_copy = prefetch_copy;
p.n_prefetch_batches = DftPrefetchBatches();
return p;
auto batch = BatchParam{p.max_bin, hess, !task.const_hess};
batch.prefetch_copy = true;
batch.n_prefetch_batches = data::cuda_impl::DftPrefetchBatches();
return batch;
}
} // namespace cuda_impl
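For contrast with StaticBatch above: HistBatch and ApproxBatch carry binning information, so passing them can trigger a rebuild of the quantized pages, while StaticBatch never does. An illustration (assuming a TrainParam `param`):

auto hist   = cuda_impl::HistBatch(param);         // carries max_bin -> may regenerate pages
auto fixed  = data::cuda_impl::StaticBatch(true);  // empty parameter -> reuse cached pages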


@@ -21,6 +21,7 @@
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/random.h" // for ColumnSampler, GlobalRandom
#include "../common/timer.h"
#include "../data/batch_utils.cuh" // for StaticBatch
#include "../data/ellpack_page.cuh"
#include "../data/ellpack_page.h"
#include "constraints.cuh"
@@ -50,11 +51,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
using cuda_impl::ApproxBatch;
using cuda_impl::HistBatch;
// Both the approx and hist initializes the DMatrix before creating the actual
// implementation (InitDataOnce). Therefore, the `GPUHistMakerDevice` can use an empty
// parameter to avoid any regen.
using cuda_impl::StaticBatch;
using data::cuda_impl::StaticBatch;
// Extra data for each node that is passed to the update position function
struct NodeSplitData {
@@ -102,11 +99,11 @@ struct GPUHistMakerDevice {
std::vector<std::unique_ptr<RowPartitioner>> partitioners_;
DeviceHistogramBuilder histogram_;
std::vector<bst_idx_t> batch_ptr_;
std::vector<bst_idx_t> const batch_ptr_;
// node idx for each sample
dh::device_vector<bst_node_t> positions_;
HistMakerTrainParam const* hist_param_;
std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
std::shared_ptr<common::HistogramCuts const> const cuts_;
auto CreatePartitionNodes(RegTree const* p_tree, std::vector<GPUExpandEntry> const& candidates) {
std::vector<bst_node_t> nidx(candidates.size());
@@ -135,35 +132,35 @@ struct GPUHistMakerDevice {
dh::device_vector<int> monotone_constraints;
TrainParam param;
TrainParam const param;
std::unique_ptr<GradientQuantiser> quantiser;
dh::PinnedMemory pinned;
dh::PinnedMemory pinned2;
common::Monitor monitor;
FeatureInteractionConstraintDevice interaction_constraints;
std::unique_ptr<GradientBasedSampler> sampler;
std::unique_ptr<FeatureGroups> feature_groups;
common::Monitor monitor;
GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param,
std::shared_ptr<common::ColumnSampler> column_sampler, BatchParam batch_param,
MetaInfo const& info, std::vector<bst_idx_t> batch_ptr,
std::shared_ptr<common::HistogramCuts const> cuts)
: evaluator_{_param, static_cast<bst_feature_t>(info.num_col_), ctx->Device()},
ctx_(ctx),
param(std::move(_param)),
column_sampler_(std::move(column_sampler)),
interaction_constraints(param, static_cast<bst_feature_t>(info.num_col_)),
ctx_{ctx},
column_sampler_{std::move(column_sampler)},
batch_ptr_{std::move(batch_ptr)},
hist_param_{hist_param},
cuts_{std::move(cuts)} {
this->sampler =
std::make_unique<GradientBasedSampler>(ctx, info.num_row_, batch_param, param.subsample,
param.sampling_method, batch_ptr_.size() > 2);
cuts_{std::move(cuts)},
param{std::move(_param)},
interaction_constraints(param, static_cast<bst_feature_t>(info.num_col_)),
sampler{std::make_unique<GradientBasedSampler>(
ctx, info.num_row_, batch_param, param.subsample, param.sampling_method,
batch_ptr_.size() > 2 && this->hist_param_->extmem_concat_pages)} {
if (!param.monotone_constraints.empty()) {
// Copy assigning an empty vector causes an exception in MSVC debug builds
monotone_constraints = param.monotone_constraints;
@@ -185,33 +182,31 @@ struct GPUHistMakerDevice {
}
// Reset values for each update iteration
[[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* p_fmat) {
[[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
this->monitor.Start(__func__);
common::SetDevice(ctx_->Ordinal());
auto const& info = p_fmat->Info();
// backup the gradient
dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
this->interaction_constraints.Reset(ctx_);
this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit());
// Sampling
/**
* Sampling
*/
dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat);
this->gpair = sample.gpair;
p_fmat = sample.p_fmat; // Update p_fmat before allocating partitioners
p_fmat = sample.p_fmat;
p_fmat->Info().feature_types.SetDevice(ctx_->Device());
std::size_t n_batches = p_fmat->NumBatches();
bool is_concat = (n_batches + 1) != this->batch_ptr_.size();
std::vector<bst_idx_t> batch_ptr{batch_ptr_};
/**
* Initialize the partitioners
*/
bool is_concat = sampler->ConcatPages();
std::size_t n_batches = is_concat ? 1 : p_fmat->NumBatches();
std::vector<bst_idx_t> batch_ptr{this->batch_ptr_};
if (is_concat) {
// Concatenate the batch ptrs as well.
batch_ptr = {static_cast<bst_idx_t>(0), p_fmat->Info().num_row_};
}
// Initialize partitions
if (!partitioners_.empty()) {
CHECK_EQ(partitioners_.size(), n_batches);
}
@@ -230,8 +225,20 @@ struct GPUHistMakerDevice {
CHECK_EQ(partitioners_.front()->Size(), p_fmat->Info().num_row_);
}
// Other initializations
quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
/**
* Initialize the evaluator
*/
this->column_sampler_->Init(ctx_, info.num_col_, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
this->interaction_constraints.Reset(ctx_);
this->evaluator_.Reset(this->ctx_, *cuts_, info.feature_types.ConstDeviceSpan(), info.num_col_,
this->param, info.IsColumnSplit());
/**
* Other initializations
*/
this->quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
this->InitFeatureGroupsOnce(info);
@@ -327,8 +334,8 @@ struct GPUHistMakerDevice {
auto d_ridx = partitioners_.at(k)->GetRows(nidx);
this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc,
feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
d_node_hist, *quantiser);
feature_groups->DeviceAccessor(ctx_->Device()), this->gpair,
d_ridx, d_node_hist, *quantiser);
monitor.Stop(__func__);
}
@@ -678,11 +685,11 @@ struct GPUHistMakerDevice {
constexpr bst_node_t kRootNIdx = RegTree::kRoot;
auto quantiser = *this->quantiser;
auto gpair_it = dh::MakeTransformIterator<GradientPairInt64>(
dh::tbegin(gpair),
dh::tbegin(this->gpair),
[=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); });
GradientPairInt64 root_sum_quantised =
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{},
thrust::plus<GradientPairInt64>{});
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + this->gpair.size(),
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
auto rc = collective::GlobalSum(
ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));


@@ -539,6 +539,7 @@ class QuantileHistMaker : public TreeUpdater {
// Copy gradient into buffer for sampling. This converts C-order to F-order.
std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
}
error::NoPageConcat(this->hist_param_.extmem_concat_pages);
SampleGradient(ctx_, *param, h_sample_out);
auto *h_out_position = &out_position[tree_it - trees.begin()];
if ((*tree_it)->IsMultiTarget()) {