[EM] Support mmap backed ellpack. (#10602)

- Support resource view in ellpack. - Define the CUDA version of the mmap resource. - Define the CUDA version of the malloc resource. - Refactor the CUDA runtime API wrappers and add wrappers related to memory access. - Gather Windows macros into a single header.
2024-07-18 08:20:21 +08:00
parent e9fbce9791
commit 292bb677e5
59 changed files with 889 additions and 646 deletions
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -11,8 +11,9 @@
 #include "../common/categorical.h"
 #include "../common/cuda_context.cuh"
 #include "../common/hist_util.cuh"
-#include "../common/transform_iterator.h"  // MakeIndexTransformIter
-#include "device_adapter.cuh"              // for NoInfInData
+#include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
+#include "../common/transform_iterator.h"   // MakeIndexTransformIter
+#include "device_adapter.cuh"               // for NoInfInData
 #include "ellpack_page.cuh"
 #include "ellpack_page.h"
 #include "gradient_index.h"
@@ -43,21 +44,19 @@ __global__ void CompressBinEllpackKernel(
    common::CompressedBufferWriter wr,
    common::CompressedByteT* __restrict__ buffer,  // gidx_buffer
    const size_t* __restrict__ row_ptrs,           // row offset of input data
-    const Entry* __restrict__ entries,      // One batch of input data
-    const float* __restrict__ cuts,         // HistogramCuts::cut_values_
-    const uint32_t* __restrict__ cut_ptrs,  // HistogramCuts::cut_ptrs_
+    const Entry* __restrict__ entries,             // One batch of input data
+    const float* __restrict__ cuts,                // HistogramCuts::cut_values_
+    const uint32_t* __restrict__ cut_ptrs,         // HistogramCuts::cut_ptrs_
    common::Span<FeatureType const> feature_types,
-    size_t base_row,                        // batch_row_begin
-    size_t n_rows,
-    size_t row_stride,
-    unsigned int null_gidx_value) {
+    size_t base_row,  // batch_row_begin
+    size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) {
  size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
  int ifeature = threadIdx.y + blockIdx.y * blockDim.y;
  if (irow >= n_rows || ifeature >= row_stride) {
    return;
  }
  int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
-  unsigned int bin = null_gidx_value;
+  std::uint32_t bin = null_gidx_value;
  if (ifeature < row_length) {
    Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
    int feature = entry.index;
@@ -89,25 +88,23 @@ __global__ void CompressBinEllpackKernel(
 }

 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
+EllpackPageImpl::EllpackPageImpl(Context const* ctx,
                                 std::shared_ptr<common::HistogramCuts const> cuts, bool is_dense,
                                 bst_idx_t row_stride, bst_idx_t n_rows)
    : is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} {
  monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(device.ordinal));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));

-  monitor_.Start("InitCompressedData");
-  this->InitCompressedData(device);
-  monitor_.Stop("InitCompressedData");
+  this->InitCompressedData(ctx);
 }

-EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
+EllpackPageImpl::EllpackPageImpl(Context const* ctx,
                                 std::shared_ptr<common::HistogramCuts const> cuts,
                                 const SparsePage& page, bool is_dense, size_t row_stride,
                                 common::Span<FeatureType const> feature_types)
    : cuts_(std::move(cuts)), is_dense(is_dense), n_rows(page.Size()), row_stride(row_stride) {
-  this->InitCompressedData(device);
-  this->CreateHistIndices(device, page, feature_types);
+  this->InitCompressedData(ctx);
+  this->CreateHistIndices(ctx->Device(), page, feature_types);
 }

 // Construct an ELLPACK matrix in memory.
@@ -129,9 +126,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
  }
  monitor_.Stop("Quantiles");

-  monitor_.Start("InitCompressedData");
-  this->InitCompressedData(ctx->Device());
-  monitor_.Stop("InitCompressedData");
+  this->InitCompressedData(ctx);

  dmat->Info().feature_types.SetDevice(ctx->Device());
  auto ft = dmat->Info().feature_types.ConstDeviceSpan();
@@ -234,7 +229,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons

  auto device_accessor = dst->GetDeviceAccessor(device);
  common::CompressedBufferWriter writer(device_accessor.NumSymbols());
-  auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
+  auto d_compressed_buffer = dst->gidx_buffer.data();

  // We redirect the scan output into this functor to do the actual writing
  WriteCompressedEllpackFunctor<AdapterBatchT> functor(
@@ -275,7 +270,7 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t
  // Write the null values
  auto device_accessor = dst->GetDeviceAccessor(device);
  common::CompressedBufferWriter writer(device_accessor.NumSymbols());
-  auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
+  auto d_compressed_buffer = dst->gidx_buffer.data();
  auto row_stride = dst->row_stride;
  dh::LaunchN(row_stride * dst->n_rows, [=] __device__(size_t idx) {
    // For some reason this variable got captured as const
@@ -290,20 +285,20 @@ void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t
 }

 template <typename AdapterBatch>
-EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
-                                 common::Span<size_t> row_counts_span,
+EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing,
+                                 bool is_dense, common::Span<size_t> row_counts_span,
                                 common::Span<FeatureType const> feature_types, size_t row_stride,
                                 size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts) {
-  dh::safe_cuda(cudaSetDevice(device.ordinal));
+  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));

-  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
-  CopyDataToEllpack(batch, feature_types, this, device, missing);
-  WriteNullValues(this, device, row_counts_span);
+  *this = EllpackPageImpl(ctx, cuts, is_dense, row_stride, n_rows);
+  CopyDataToEllpack(batch, feature_types, this, ctx->Device(), missing);
+  WriteNullValues(this, ctx->Device(), row_counts_span);
 }

 #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T)                                                \
  template EllpackPageImpl::EllpackPageImpl(                                               \
-      __BATCH_T batch, float missing, DeviceOrd device, bool is_dense,                     \
+      Context const* ctx, __BATCH_T batch, float missing, bool is_dense,                   \
      common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
      size_t row_stride, size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);

@@ -365,12 +360,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
  row_stride = *std::max_element(it, it + page.Size());

  CHECK(ctx->IsCUDA());
-  monitor_.Start("InitCompressedData");
-  InitCompressedData(ctx->Device());
-  monitor_.Stop("InitCompressedData");
+  InitCompressedData(ctx);

  // copy gidx
-  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
+  common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
  dh::device_vector<size_t> row_ptr(page.row_ptr.size());
  auto d_row_ptr = dh::ToSpan(row_ptr);
  dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
@@ -389,20 +382,20 @@ struct CopyPage {
  // The number of elements to skip.
  size_t offset;

-  CopyPage(EllpackPageImpl *dst, EllpackPageImpl const *src, size_t offset)
-      : cbw{dst->NumSymbols()}, dst_data_d{dst->gidx_buffer.DevicePointer()},
-        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
+  CopyPage(EllpackPageImpl* dst, EllpackPageImpl const* src, size_t offset)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
        offset(offset) {}

  __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
-                          element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
  }
 };

 // Copy the data from the given EllpackPage to the current page.
-size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
-  monitor_.Start("Copy");
+size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) {
+  monitor_.Start(__func__);
  bst_idx_t num_elements = page->n_rows * page->row_stride;
  CHECK_EQ(row_stride, page->row_stride);
  CHECK_EQ(NumSymbols(), page->NumSymbols());
@@ -411,10 +404,8 @@ size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size
    LOG(FATAL) << "Concatenating the same Ellpack.";
    return this->n_rows * this->row_stride;
  }
-  gidx_buffer.SetDevice(device);
-  page->gidx_buffer.SetDevice(device);
-  dh::LaunchN(num_elements, CopyPage(this, page, offset));
-  monitor_.Stop("Copy");
+  dh::LaunchN(num_elements, CopyPage{this, page, offset});
+  monitor_.Stop(__func__);
  return num_elements;
 }

@@ -423,8 +414,8 @@ struct CompactPage {
  common::CompressedBufferWriter cbw;
  common::CompressedByteT* dst_data_d;
  common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted
-   * page.
+  /**
+   * @brief An array that maps the rows from the full DMatrix to the compacted page.
   *
   * The total size is the number of rows in the original, uncompacted DMatrix.
   * Elements are the row ids in the compacted page. Rows not needed are set to
@@ -438,24 +429,24 @@ struct CompactPage {
  size_t base_rowid;
  size_t row_stride;

-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src,
-              common::Span<size_t> row_indexes)
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl const* src, common::Span<size_t> row_indexes)
      : cbw{dst->NumSymbols()},
-        dst_data_d{dst->gidx_buffer.DevicePointer()},
-        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.data()},
+        src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()},
        row_indexes(row_indexes),
        base_rowid{src->base_rowid},
        row_stride{src->row_stride} {}

-  __device__ void operator()(size_t row_id) {
+  __device__ void operator()(bst_idx_t row_id) {
    size_t src_row = base_rowid + row_id;
    size_t dst_row = row_indexes[src_row];
-    if (dst_row == SIZE_MAX) return;
+    if (dst_row == SIZE_MAX) {
+      return;
+    }
    size_t dst_offset = dst_row * row_stride;
    size_t src_offset = row_id * row_stride;
    for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
-                            dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
    }
  }
 };
@@ -467,28 +458,22 @@ void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page,
  CHECK_EQ(row_stride, page->row_stride);
  CHECK_EQ(NumSymbols(), page->NumSymbols());
  CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
-  gidx_buffer.SetDevice(ctx->Device());
-  page->gidx_buffer.SetDevice(ctx->Device());
  auto cuctx = ctx->CUDACtx();
-  dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes));
+  dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes});
  monitor_.Stop(__func__);
 }

 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
-  size_t num_symbols = NumSymbols();
+void EllpackPageImpl::InitCompressedData(Context const* ctx) {
+  monitor_.Start(__func__);
+  auto num_symbols = NumSymbols();

  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes =
+  std::size_t compressed_size_bytes =
      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
-  gidx_buffer.SetDevice(device);
-  // Don't call fill unnecessarily
-  if (gidx_buffer.Size() == 0) {
-    gidx_buffer.Resize(compressed_size_bytes, 0);
-  } else {
-    gidx_buffer.Resize(compressed_size_bytes, 0);
-    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-  }
+  auto init = static_cast<common::CompressedByteT>(0);
+  gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init);
+  monitor_.Stop(__func__);
 }

 // Compress a CSR page into ELLPACK.
@@ -496,7 +481,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
                                        const SparsePage& row_batch,
                                        common::Span<FeatureType const> feature_types) {
  if (row_batch.Size() == 0) return;
-  unsigned int null_gidx_value = NumSymbols() - 1;
+  std::uint32_t null_gidx_value = NumSymbols() - 1;

  const auto& offset_vec = row_batch.offset.ConstHostVector();

@@ -541,13 +526,11 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
    const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
                     common::DivRoundUp(row_stride, block3.y), 1);
    auto device_accessor = GetDeviceAccessor(device);
-    dh::LaunchKernel {grid3, block3}(
-        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
-        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
-        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
-        device_accessor.feature_segments.data(), feature_types,
-        batch_row_begin, batch_nrows, row_stride,
-        null_gidx_value);
+    dh::LaunchKernel{grid3, block3}(  // NOLINT
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()), gidx_buffer.data(),
+        row_ptrs.data().get(), entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows,
+        row_stride, null_gidx_value);
  }
 }

@@ -566,26 +549,31 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,

 EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
    DeviceOrd device, common::Span<FeatureType const> feature_types) const {
-  gidx_buffer.SetDevice(device);
  return {device,
          cuts_,
          is_dense,
          row_stride,
          base_rowid,
          n_rows,
-          common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
-                                               NumSymbols()),
+          common::CompressedIterator<uint32_t>(gidx_buffer.data(), NumSymbols()),
          feature_types};
 }
+
 EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
+    Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
    common::Span<FeatureType const> feature_types) const {
+  h_gidx_buffer->resize(gidx_buffer.size());
+  CHECK_EQ(h_gidx_buffer->size(), gidx_buffer.size());
+  CHECK_NE(gidx_buffer.size(), 0);
+  dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(),
+                                cudaMemcpyDefault, dh::DefaultStream()));
  return {DeviceOrd::CPU(),
          cuts_,
          is_dense,
          row_stride,
          base_rowid,
          n_rows,
-          common::CompressedIterator<uint32_t>(gidx_buffer.ConstHostPointer(), NumSymbols()),
+          common::CompressedIterator<uint32_t>(h_gidx_buffer->data(), NumSymbols()),
          feature_types};
 }
 }  // namespace xgboost
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -1,23 +1,25 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
 */
-
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
 #define XGBOOST_DATA_ELLPACK_PAGE_CUH_

 #include <thrust/binary_search.h>
-#include <xgboost/data.h>

 #include "../common/categorical.h"
 #include "../common/compressed_iterator.h"
 #include "../common/device_helpers.cuh"
 #include "../common/hist_util.h"
+#include "../common/ref_resource_view.h"  // for RefResourceView
 #include "ellpack_page.h"
+#include "xgboost/data.h"

 namespace xgboost {
-/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
+/**
+ * @brief Struct for accessing and manipulating an ELLPACK matrix on the device.
+ *
+ * Does not own underlying memory and may be trivially copied into kernels.
+ */
 struct EllpackDeviceAccessor {
  /*! \brief Whether or not if the matrix is dense. */
  bool is_dense;
@@ -128,31 +130,31 @@ class GHistIndexMatrix;

 class EllpackPageImpl {
 public:
-  /*!
-   * \brief Default constructor.
+  /**
+   * @brief Default constructor.
   *
   * This is used in the external memory case. An empty ELLPACK page is constructed with its content
   * set later by the reader.
   */
  EllpackPageImpl() = default;

-  /*!
-   * \brief Constructor from an existing EllpackInfo.
+  /**
+   * @brief Constructor from an existing EllpackInfo.
   *
-   * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
-   * and the given number of rows.
+   * This is used in the sampling case. The ELLPACK page is constructed from an existing
+   * Ellpack page and the given number of rows.
   */
-  EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts const> cuts,
+  EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
                  bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);
-  /*!
-   * \brief Constructor used for external memory.
+  /**
+   * @brief Constructor used for external memory.
   */
-  EllpackPageImpl(DeviceOrd device, std::shared_ptr<common::HistogramCuts const> cuts,
+  EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
                  const SparsePage& page, bool is_dense, size_t row_stride,
                  common::Span<FeatureType const> feature_types);

-  /*!
-   * \brief Constructor from an existing DMatrix.
+  /**
+   * @brief Constructor from an existing DMatrix.
   *
   * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
   * in CSR format.
@@ -160,37 +162,39 @@ class EllpackPageImpl {
  explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

  template <typename AdapterBatch>
-  explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
+  explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
                           common::Span<size_t> row_counts_span,
                           common::Span<FeatureType const> feature_types, size_t row_stride,
                           size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
  /**
-   * \brief Constructor from an existing CPU gradient index.
+   * @brief Constructor from an existing CPU gradient index.
   */
  explicit EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page,
                           common::Span<FeatureType const> ft);

-  /*! \brief Copy the elements of the given ELLPACK page into this page.
+  /**
+   * @brief Copy the elements of the given ELLPACK page into this page.
   *
-   * @param device The GPU device to use.
+   * @param ctx The GPU context.
   * @param page The ELLPACK page to copy from.
   * @param offset The number of elements to skip before copying.
   * @returns The number of elements copied.
   */
-  size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
+  bst_idx_t Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset);

-  /*! \brief Compact the given ELLPACK page into the current page.
+  /**
+   * @brief Compact the given ELLPACK page into the current page.
   *
-   * @param context The GPU context.
+   * @param ctx The GPU context.
   * @param page The ELLPACK page to compact from.
   * @param row_indexes Row indexes for the compacted page.
   */
  void Compact(Context const* ctx, EllpackPageImpl const* page, common::Span<size_t> row_indexes);

-  /*! \return Number of instances in the page. */
+  /** @return Number of instances in the page. */
  [[nodiscard]] bst_idx_t Size() const;

-  /*! \brief Set the base row id for this page. */
+  /** @brief Set the base row id for this page. */
  void SetBaseRowId(std::size_t row_id) {
    base_rowid = row_id;
  }
@@ -199,43 +203,54 @@ class EllpackPageImpl {
  [[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
  void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts) { cuts_ = cuts; }

-  /*! \return Estimation of memory cost of this page. */
+  /** @return Estimation of memory cost of this page. */
  static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;


-  /*! \brief Return the total number of symbols (total number of bins plus 1 for
-   * not found). */
+  /**
+   * @brief Return the total number of symbols (total number of bins plus 1 for not
+   *        found).
+   */
  [[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
-
+  /**
+   * @brief Get an accessor that can be passed into CUDA kernels.
+   */
  [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
      DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
+  /**
+   * @brief Get an accessor for host code.
+   */
  [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
+      Context const* ctx, std::vector<common::CompressedByteT>* h_gidx_buffer,
      common::Span<FeatureType const> feature_types = {}) const;

 private:
-  /*!
-   * \brief Compress a single page of CSR data into ELLPACK.
+  /**
+   * @brief Compress a single page of CSR data into ELLPACK.
   *
   * @param device The GPU device to use.
   * @param row_batch The CSR page.
   */
-  void CreateHistIndices(DeviceOrd device,
-                         const SparsePage& row_batch,
+  void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
                         common::Span<FeatureType const> feature_types);
-  /*!
-   * \brief Initialize the buffer to store compressed features.
+  /**
+   * @brief Initialize the buffer to store compressed features.
   */
-  void InitCompressedData(DeviceOrd device);
+  void InitCompressedData(Context const* ctx);

 public:
-  /*! \brief Whether or not if the matrix is dense. */
+  /** @brief Whether the matrix is dense. */
  bool is_dense;
-  /*! \brief Row length for ELLPACK. */
+  /** @brief Row length for ELLPACK. */
  bst_idx_t row_stride;
  bst_idx_t base_rowid{0};
-  bst_idx_t n_rows{};
-  /*! \brief global index of histogram, which is stored in ELLPACK format. */
-  HostDeviceVector<common::CompressedByteT> gidx_buffer;
+  bst_idx_t n_rows{0};
+  /**
+   * @brief Index of the gradient histogram, which is stored in ELLPACK format.
+   *
+   * This can be backed by various storage types.
+   */
+  common::RefResourceView<common::CompressedByteT> gidx_buffer;

 private:
  std::shared_ptr<common::HistogramCuts const> cuts_;
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -4,11 +4,12 @@
 #include <dmlc/registry.h>

 #include <cstddef>  // for size_t
-#include <cstdint>  // for uint64_t
+#include <vector>   // for vector

-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
-#include "ellpack_page.cuh"               // for EllpackPage
+#include "../common/io.h"                   // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
+#include "../common/ref_resource_view.h"    // for ReadVec, WriteVec
+#include "ellpack_page.cuh"                 // for EllpackPage
 #include "ellpack_page_raw_format.h"
 #include "ellpack_page_source.h"

@@ -16,8 +17,10 @@ namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);

 namespace {
+// Read a vector into a cudaMalloc-backed buffer; used on systems without HMM or ATS.
 template <typename T>
-[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, HostDeviceVector<T>* vec) {
+[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
+                                 common::RefResourceView<T>* vec) {
  std::uint64_t n{0};
  if (!fi->Read(&n)) {
    return false;
@@ -33,34 +36,34 @@ template <typename T>
    return false;
  }

-  vec->Resize(n);
-  auto d_vec = vec->DeviceSpan();
-  dh::safe_cuda(
-      cudaMemcpyAsync(d_vec.data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
+  auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  *vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<T>(0));
+  dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
  return true;
 }
 }  // namespace

+#define RET_IF_NOT(expr) \
+  if (!(expr)) {         \
+    return false;        \
+  }
+
 [[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
                                              common::AlignedResourceReadStream* fi) {
  auto* impl = page->Impl();
+
  impl->SetCuts(this->cuts_);
-  if (!fi->Read(&impl->n_rows)) {
-    return false;
-  }
-  if (!fi->Read(&impl->is_dense)) {
-    return false;
-  }
-  if (!fi->Read(&impl->row_stride)) {
-    return false;
-  }
-  impl->gidx_buffer.SetDevice(device_);
-  if (!ReadDeviceVec(fi, &impl->gidx_buffer)) {
-    return false;
-  }
-  if (!fi->Read(&impl->base_rowid)) {
-    return false;
+  RET_IF_NOT(fi->Read(&impl->n_rows));
+  RET_IF_NOT(fi->Read(&impl->is_dense));
+  RET_IF_NOT(fi->Read(&impl->row_stride));
+
+  if (has_hmm_ats_) {
+    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
+  } else {
+    RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
  }
+  RET_IF_NOT(fi->Read(&impl->base_rowid));
+  dh::DefaultStream().Sync();
  return true;
 }

@@ -71,8 +74,10 @@ template <typename T>
  bytes += fo->Write(impl->n_rows);
  bytes += fo->Write(impl->is_dense);
  bytes += fo->Write(impl->row_stride);
-  CHECK(!impl->gidx_buffer.ConstHostVector().empty());
-  bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
+  bytes += common::WriteVec(fo, h_gidx_buffer);
  bytes += fo->Write(impl->base_rowid);
  dh::DefaultStream().Sync();
  return bytes;
@@ -82,33 +87,20 @@ template <typename T>
  auto* impl = page->Impl();
  CHECK(this->cuts_->cut_values_.DeviceCanRead());
  impl->SetCuts(this->cuts_);
-  if (!fi->Read(&impl->n_rows)) {
-    return false;
-  }
-  if (!fi->Read(&impl->is_dense)) {
-    return false;
-  }
-  if (!fi->Read(&impl->row_stride)) {
-    return false;
-  }
+  RET_IF_NOT(fi->Read(&impl->n_rows));
+  RET_IF_NOT(fi->Read(&impl->is_dense));
+  RET_IF_NOT(fi->Read(&impl->row_stride));

  // Read vec
+  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
  bst_idx_t n{0};
-  if (!fi->Read(&n)) {
-    return false;
-  }
+  RET_IF_NOT(fi->Read(&n));
  if (n != 0) {
-    impl->gidx_buffer.SetDevice(device_);
-    impl->gidx_buffer.Resize(n);
-    auto span = impl->gidx_buffer.DeviceSpan();
-    if (!fi->Read(span.data(), span.size_bytes())) {
-      return false;
-    }
-  }
-
-  if (!fi->Read(&impl->base_rowid)) {
-    return false;
+    impl->gidx_buffer =
+        common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<common::CompressedByteT>(0));
+    RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
  }
+  RET_IF_NOT(fi->Read(&impl->base_rowid));

  dh::DefaultStream().Sync();
  return true;
@@ -123,16 +115,17 @@ template <typename T>
  bytes += fo->Write(impl->row_stride);

  // Write vector
-  bst_idx_t n = impl->gidx_buffer.Size();
+  bst_idx_t n = impl->gidx_buffer.size();
  bytes += fo->Write(n);

-  if (!impl->gidx_buffer.Empty()) {
-    auto span = impl->gidx_buffer.ConstDeviceSpan();
-    bytes += fo->Write(span.data(), span.size_bytes());
+  if (!impl->gidx_buffer.empty()) {
+    bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
  }
  bytes += fo->Write(impl->base_rowid);

  dh::DefaultStream().Sync();
  return bytes;
 }
+
+#undef RET_IF_NOT
 }  // namespace xgboost::data
--- a/src/data/ellpack_page_raw_format.h
+++ b/src/data/ellpack_page_raw_format.h
@@ -26,10 +26,13 @@ class EllpackHostCacheStream;
 class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
  std::shared_ptr<common::HistogramCuts const> cuts_;
  DeviceOrd device_;
+  // Supports CUDA HMM or ATS
+  bool has_hmm_ats_{false};

 public:
-  explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device)
-      : cuts_{std::move(cuts)}, device_{device} {}
+  explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
+                                bool has_hmm_ats)
+      : cuts_{std::move(cuts)}, device_{device}, has_hmm_ats_{has_hmm_ats} {}
  [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override;
  [[nodiscard]] std::size_t Write(const EllpackPage& page,
                                  common::AlignedFileWriteStream* fo) override;
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -11,6 +11,7 @@
 #include "../common/common.h"                 // for safe_cuda
 #include "../common/cuda_pinned_allocator.h"  // for pinned_allocator
 #include "../common/device_helpers.cuh"       // for CUDAStreamView, DefaultStream
+#include "../common/resource.cuh"             // for PrivateCudaMmapConstStream
 #include "ellpack_page.cuh"                   // for EllpackPageImpl
 #include "ellpack_page.h"                     // for EllpackPage
 #include "ellpack_page_source.h"
@@ -86,16 +87,16 @@ void EllpackHostCacheStream::Seek(bst_idx_t offset_bytes) { this->p_impl_->Seek(
 void EllpackHostCacheStream::Bound(bst_idx_t offset_bytes) { this->p_impl_->Bound(offset_bytes); }

 /**
- * EllpackFormatType
+ * EllpackCacheStreamPolicy
 */

 template <typename S, template <typename> typename F>
-EllpackFormatStreamPolicy<S, F>::EllpackFormatStreamPolicy()
+EllpackCacheStreamPolicy<S, F>::EllpackCacheStreamPolicy()
    : p_cache_{std::make_shared<EllpackHostCache>()} {}

 template <typename S, template <typename> typename F>
-[[nodiscard]] std::unique_ptr<typename EllpackFormatStreamPolicy<S, F>::WriterT>
-EllpackFormatStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
+[[nodiscard]] std::unique_ptr<typename EllpackCacheStreamPolicy<S, F>::WriterT>
+EllpackCacheStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
  auto fo = std::make_unique<EllpackHostCacheStream>(this->p_cache_);
  if (iter == 0) {
    CHECK(this->p_cache_->cache.empty());
@@ -106,9 +107,8 @@ EllpackFormatStreamPolicy<S, F>::CreateWriter(StringView, std::uint32_t iter) {
 }

 template <typename S, template <typename> typename F>
-[[nodiscard]] std::unique_ptr<typename EllpackFormatStreamPolicy<S, F>::ReaderT>
-EllpackFormatStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset,
-                                              bst_idx_t length) const {
+[[nodiscard]] std::unique_ptr<typename EllpackCacheStreamPolicy<S, F>::ReaderT>
+EllpackCacheStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset, bst_idx_t length) const {
  auto fi = std::make_unique<ReaderT>(this->p_cache_);
  fi->Seek(offset);
  fi->Bound(offset + length);
@@ -117,18 +117,40 @@ EllpackFormatStreamPolicy<S, F>::CreateReader(StringView, bst_idx_t offset,
 }

 // Instantiation
-template EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::EllpackFormatStreamPolicy();
+template EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::EllpackCacheStreamPolicy();

 template std::unique_ptr<
-    typename EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::WriterT>
-EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringView name,
-                                                                          std::uint32_t iter);
+    typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::WriterT>
+EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateWriter(StringView name,
+                                                                         std::uint32_t iter);

 template std::unique_ptr<
-    typename EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
-EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
+    typename EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
+EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(
    StringView name, std::uint64_t offset, std::uint64_t length) const;

+/**
+ * EllpackMmapStreamPolicy
+ */
+
+// Build the reader for `name`; the concrete stream type is chosen by `has_hmm_`.
+template <typename S, template <typename> typename F>
+[[nodiscard]] std::unique_ptr<typename EllpackMmapStreamPolicy<S, F>::ReaderT>
+EllpackMmapStreamPolicy<S, F>::CreateReader(StringView name, bst_idx_t offset,
+                                            bst_idx_t length) const {
+  if (!has_hmm_) {  // flag unset: use the non-CUDA mmap stream
+    return std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
+  }
+  return std::make_unique<common::PrivateCudaMmapConstStream>(name, offset, length);
+}
+
+// Instantiation
+template std::unique_ptr<
+    typename EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::ReaderT>
+EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringView name,
+                                                                        bst_idx_t offset,
+                                                                        bst_idx_t length) const;
+
 /**
 * EllpackPageSourceImpl
 */
@@ -146,8 +168,8 @@ void EllpackPageSourceImpl<F>::Fetch() {
    auto const& csr = this->source_->Page();
    this->page_.reset(new EllpackPage{});
    auto* impl = this->page_->Impl();
-    *impl = EllpackPageImpl{this->Device(), this->GetCuts(), *csr,
-                            is_dense_,      row_stride_,     feature_types_};
+    Context ctx = Context{}.MakeCUDA(this->Device().ordinal);
+    *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
    this->page_->SetBaseRowId(csr->base_rowid);
    this->WriteCache();
  }
@@ -157,5 +179,7 @@ void EllpackPageSourceImpl<F>::Fetch() {
 template void
 EllpackPageSourceImpl<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
 template void
-EllpackPageSourceImpl<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
+EllpackPageSourceImpl<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
+template void
+EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>::Fetch();
 }  // namespace xgboost::data
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -9,6 +9,7 @@
 #include <memory>   // for shared_ptr
 #include <utility>  // for move

+#include "../common/cuda_rt_utils.h"  // for SupportsPageableMem
 #include "../common/hist_util.h"      // for HistogramCuts
 #include "ellpack_page.h"             // for EllpackPage
 #include "ellpack_page_raw_format.h"  // for EllpackPageRawFormat
@@ -59,14 +60,19 @@ template <typename S>
 class EllpackFormatPolicy {
  std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
  DeviceOrd device_;
+  bool has_hmm_{common::SupportsPageableMem()};

 public:
  using FormatT = EllpackPageRawFormat;

 public:
+  EllpackFormatPolicy() = default;
+  // For testing with the HMM flag.
+  explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {}
+
  [[nodiscard]] auto CreatePageFormat() const {
    CHECK_EQ(cuts_->cut_values_.Device(), device_);
-    std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_}};
+    std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_, has_hmm_}};
    return fmt;
  }

@@ -83,7 +89,7 @@ class EllpackFormatPolicy {
 };

 template <typename S, template <typename> typename F>
-class EllpackFormatStreamPolicy : public F<S> {
+class EllpackCacheStreamPolicy : public F<S> {
  std::shared_ptr<EllpackHostCache> p_cache_;

 public:
@@ -91,13 +97,42 @@ class EllpackFormatStreamPolicy : public F<S> {
  using ReaderT = EllpackHostCacheStream;

 public:
-  EllpackFormatStreamPolicy();
+  EllpackCacheStreamPolicy();
  [[nodiscard]] std::unique_ptr<WriterT> CreateWriter(StringView name, std::uint32_t iter);

  [[nodiscard]] std::unique_ptr<ReaderT> CreateReader(StringView name, bst_idx_t offset,
                                                      bst_idx_t length) const;
 };

+// Stream policy behind the "cache to disk" ellpack source: file writer, resource-stream reader.
+template <typename S, template <typename> typename F>
+class EllpackMmapStreamPolicy : public F<S> {
+  bool has_hmm_{common::SupportsPageableMem()};  // selects the stream type in CreateReader
+
+ public:
+  using WriterT = common::AlignedFileWriteStream;
+  using ReaderT = common::AlignedResourceReadStream;
+
+  EllpackMmapStreamPolicy() = default;
+  // For testing with the HMM flag. `Policy` defaults to `F<S>` so that the constraint depends
+  // on this constructor's own template parameter and is a genuine SFINAE condition; written
+  // on `F<S>` directly it is a hard error as soon as the class is instantiated with another F.
+  template <typename Policy = F<S>,
+            std::enable_if_t<std::is_same_v<Policy, EllpackFormatPolicy<EllpackPage>>>* = nullptr>
+  explicit EllpackMmapStreamPolicy(bool has_hmm) : F<S>{has_hmm}, has_hmm_{has_hmm} {}
+
+  // Open `name` for writing: mode "wb" when `iter == 0`, mode "ab" for every later iteration.
+  [[nodiscard]] std::unique_ptr<WriterT> CreateWriter(StringView name, std::uint32_t iter) {
+    if (iter == 0) {
+      return std::make_unique<common::AlignedFileWriteStream>(name, "wb");
+    }
+    return std::make_unique<common::AlignedFileWriteStream>(name, "ab");
+  }
+
+  [[nodiscard]] std::unique_ptr<ReaderT> CreateReader(StringView name, bst_idx_t offset,
+                                                      bst_idx_t length) const;
+};
+
 template <typename F>
 class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
  using Super = PageSourceIncMixIn<EllpackPage, F>;
@@ -128,11 +163,11 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {

 // Cache to host
 using EllpackPageHostSource =
-    EllpackPageSourceImpl<EllpackFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
+    EllpackPageSourceImpl<EllpackCacheStreamPolicy<EllpackPage, EllpackFormatPolicy>>;

 // Cache to disk
 using EllpackPageSource =
-    EllpackPageSourceImpl<DefaultFormatStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
+    EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;

 #if !defined(XGBOOST_USE_CUDA)
 template <typename F>
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -16,7 +16,8 @@ template <typename BinT, typename CompressOffset>
 void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
                  std::vector<size_t>* p_hit_count_tloc, CompressOffset&& get_offset,
                  GHistIndexMatrix* out) {
-  auto accessor = page->GetHostAccessor();
+  std::vector<common::CompressedByteT> h_gidx_buffer;
+  auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer);
  auto const kNull = static_cast<bst_bin_t>(accessor.NullValue());

  common::Span<BinT> index_data_span = {out->index.data<BinT>(), out->index.Size()};
@@ -47,7 +48,8 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
  if (page->is_dense) {
    std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
  } else {
-    auto accessor = page->GetHostAccessor();
+    std::vector<common::CompressedByteT> h_gidx_buffer;
+    auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer);
    auto const kNull = static_cast<bst_bin_t>(accessor.NullValue());

    common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) {
--- a/src/data/histogram_cut_format.h
+++ b/src/data/histogram_cut_format.h
@@ -1,49 +0,0 @@
-/**
- * Copyright 2021-2024, XGBoost contributors
- */
-#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
-#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
-
-#include <dmlc/io.h>  // for Stream
-
-#include <cstddef>  // for size_t
-
-#include "../common/hist_util.h"          // for HistogramCuts
-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for WriteVec, ReadVec
-
-namespace xgboost::data {
-inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
-  if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
-    return false;
-  }
-  bool has_cat{false};
-  if (!fi->Read(&has_cat)) {
-    return false;
-  }
-  decltype(cuts->MaxCategory()) max_cat{0};
-  if (!fi->Read(&max_cat)) {
-    return false;
-  }
-  cuts->SetCategorical(has_cat, max_cat);
-  return true;
-}
-
-inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
-                                      common::AlignedFileWriteStream *fo) {
-  std::size_t bytes = 0;
-  bytes += common::WriteVec(fo, cuts.Values());
-  bytes += common::WriteVec(fo, cuts.Ptrs());
-  bytes += common::WriteVec(fo, cuts.MinValues());
-  bytes += fo->Write(cuts.HasCategorical());
-  bytes += fo->Write(cuts.MaxCategory());
-  return bytes;
-}
-}  // namespace xgboost::data
-#endif  // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -5,6 +5,7 @@
 #include <memory>

 #include "../collective/allreduce.h"
+#include "../common/cuda_rt_utils.h"  // for AllVisibleGPUs
 #include "../common/hist_util.cuh"
 #include "batch_utils.h"  // for RegenGHist
 #include "device_adapter.cuh"
@@ -45,11 +46,17 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,

  int32_t current_device;
  dh::safe_cuda(cudaGetDevice(&current_device));
+  auto get_ctx = [&]() {
+    Context d_ctx = (ctx->IsCUDA()) ? *ctx : Context{}.MakeCUDA(current_device);
+    CHECK(!d_ctx.IsCPU());
+    return d_ctx;
+  };
  auto get_device = [&]() {
    auto d = (ctx->IsCUDA()) ? ctx->Device() : DeviceOrd::CUDA(current_device);
    CHECK(!d.IsCPU());
    return d;
  };
+  fmat_ctx_ = get_ctx();

  /**
   * Generate quantiles
@@ -118,7 +125,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
      // that case device id is invalid.
      ellpack_.reset(new EllpackPage);
      *(ellpack_->Impl()) =
-          EllpackPageImpl(get_device(), cuts, this->IsDense(), row_stride, accumulated_rows);
+          EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), row_stride, accumulated_rows);
    }
  };

@@ -142,10 +149,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    proxy->Info().feature_types.SetDevice(get_device());
    auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
    auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
-      return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
-                             d_feature_types, row_stride, rows, cuts);
+      return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
+                             row_stride, rows, cuts);
    });
-    size_t num_elements = ellpack_->Impl()->Copy(get_device(), &new_impl, offset);
+    std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
    offset += num_elements;

    proxy->Info().num_row_ = num_rows();
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -226,7 +226,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
    }
    // An heuristic for number of pre-fetched batches.  We can make it part of BatchParam
    // to let user adjust number of pre-fetched batches when needed.
-    std::int32_t kPrefetches = 3;
+    std::int32_t constexpr kPrefetches = 3;
    std::int32_t n_prefetches = std::min(nthreads_, kPrefetches);
    n_prefetches = std::max(n_prefetches, 1);
    std::int32_t n_prefetch_batches = std::min(static_cast<bst_idx_t>(n_prefetches), n_batches_);