Partial rewrite EllpackPage (#5352)

2020-03-11 10:15:53 +13:00
parent 7a99f8f27f
commit 3ad4333b0e
23 changed files with 496 additions and 733 deletions
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -13,11 +13,24 @@ class EllpackPageImpl {};
 EllpackPage::EllpackPage() = default;

 EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
 }

 EllpackPage::~EllpackPage() {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+void EllpackPage::SetBaseRowId(size_t row_id) {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+size_t EllpackPage::Size() const {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+  return 0;
 }

 }  // namespace xgboost
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -4,9 +4,9 @@

 #include <xgboost/data.h>

-#include "./ellpack_page.cuh"
 #include "../common/hist_util.h"
 #include "../common/random.h"
+#include "./ellpack_page.cuh"

 namespace xgboost {

@@ -17,13 +17,9 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)

 EllpackPage::~EllpackPage() = default;

-size_t EllpackPage::Size() const {
-  return impl_->Size();
-}
+size_t EllpackPage::Size() const { return impl_->Size(); }

-void EllpackPage::SetBaseRowId(size_t row_id) {
-  impl_->SetBaseRowId(row_id);
-}
+void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); }

 // Bin each input data entry, store the bin indices in compressed form.
 __global__ void CompressBinEllpackKernel(
@@ -65,16 +61,18 @@ __global__ void CompressBinEllpackKernel(
 }

 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) {
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 bool is_dense, size_t row_stride,
+                                 size_t n_rows)
+    : is_dense(is_dense),
+      cuts_(std::move(cuts)),
+      row_stride(row_stride),
+      n_rows(n_rows) {
  monitor_.Init("ellpack_page");
  dh::safe_cuda(cudaSetDevice(device));

-  matrix.info = info;
-  matrix.base_rowid = 0;
-  matrix.n_rows = n_rows;
-
  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, n_rows);
+  InitCompressedData(device);
  monitor_.StopCuda("InitCompressedData");
 }

@@ -93,33 +91,27 @@ size_t GetRowStride(DMatrix* dmat) {
 }

 // Construct an ELLPACK matrix in memory.
-EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
+EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
+    : is_dense(dmat->IsDense()) {
  monitor_.Init("ellpack_page");
  dh::safe_cuda(cudaSetDevice(param.gpu_id));

-  matrix.n_rows = dmat->Info().num_row_;
+  n_rows = dmat->Info().num_row_;

  monitor_.StartCuda("Quantiles");
  // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
-  size_t row_stride = GetRowStride(dmat);
-  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
+  row_stride = GetRowStride(dmat);
+  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
                                   param.gpu_batch_nrows);
  monitor_.StopCuda("Quantiles");

-  monitor_.StartCuda("InitEllpackInfo");
-  InitInfo(param.gpu_id, dmat->IsDense(), row_stride, cuts);
-  monitor_.StopCuda("InitEllpackInfo");
-
  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(param.gpu_id, dmat->Info().num_row_);
+  InitCompressedData(param.gpu_id);
  monitor_.StopCuda("InitCompressedData");

  monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(dmat->Info().num_row_);
  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    hist_builder_row_state.BeginBatch(batch);
-    CreateHistIndices(param.gpu_id, batch, hist_builder_row_state.GetRowStateOnDevice());
-    hist_builder_row_state.EndBatch();
+    CreateHistIndices(param.gpu_id, batch);
  }
  monitor_.StopCuda("BinningCompression");
 }
@@ -133,23 +125,26 @@ struct CopyPage {
  size_t offset;

  CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
        offset(offset) {}

  __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
+                          element_id + offset);
  }
 };

 // Copy the data from the given EllpackPage to the current page.
 size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
  monitor_.StartCuda("Copy");
-  size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements);
+  size_t num_elements = page->n_rows * page->row_stride;
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_GE(n_rows * row_stride, offset + num_elements);
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
  dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
  monitor_.StopCuda("Copy");
  return num_elements;
@@ -160,26 +155,29 @@ struct CompactPage {
  common::CompressedBufferWriter cbw;
  common::CompressedByteT* dst_data_d;
  common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted page.
+  /*! \brief An array that maps the rows from the full DMatrix to the compacted
+   * page.
   *
-   * The total size is the number of rows in the original, uncompacted DMatrix. Elements are the
-   * row ids in the compacted page. Rows not needed are set to SIZE_MAX.
+   * The total size is the number of rows in the original, uncompacted DMatrix.
+   * Elements are the row ids in the compacted page. Rows not needed are set to
+   * SIZE_MAX.
   *
   * An example compacting 16 rows to 8 rows:
-   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX,
-   * SIZE_MAX]
+   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6,
+   * SIZE_MAX, 7, SIZE_MAX, SIZE_MAX]
   */
  common::Span<size_t> row_indexes;
  size_t base_rowid;
  size_t row_stride;

-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span<size_t> row_indexes)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
+              common::Span<size_t> row_indexes)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
        row_indexes(row_indexes),
-        base_rowid{src->matrix.base_rowid},
-        row_stride{src->matrix.info.row_stride} {}
+        base_rowid{src->base_rowid},
+        row_stride{src->row_stride} {}

  __device__ void operator()(size_t row_id) {
    size_t src_row = base_rowid + row_id;
@@ -188,100 +186,72 @@ struct CompactPage {
    size_t dst_offset = dst_row * row_stride;
    size_t src_offset = row_id * row_stride;
    for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
+                            dst_offset + j);
    }
  }
 };

 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes) {
+void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
+                              common::Span<size_t> row_indexes) {
  monitor_.StartCuda("Compact");
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size());
-  dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes));
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
+  dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
  monitor_.StopCuda("Compact");
 }

-// Construct an EllpackInfo based on histogram cuts of features.
-EllpackInfo::EllpackInfo(int device,
-                         bool is_dense,
-                         size_t row_stride,
-                         const common::HistogramCuts& hmat,
-                         dh::BulkAllocator* ba)
-    : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {
-
-  ba->Allocate(device,
-               &feature_segments, hmat.Ptrs().size(),
-               &gidx_fvalue_map, hmat.Values().size(),
-               &min_fvalue, hmat.MinValues().size());
-  dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
-  dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
-  dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
-}
-
-// Initialize the EllpackInfo for this page.
-void EllpackPageImpl::InitInfo(int device,
-                               bool is_dense,
-                               size_t row_stride,
-                               const common::HistogramCuts& hmat) {
-  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
-}
-
 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) {
-  size_t num_symbols = matrix.info.NumSymbols();
+void EllpackPageImpl::InitCompressedData(int device) {
+  size_t num_symbols = NumSymbols();

  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * num_rows, num_symbols);
-  ba_.Allocate(device, &gidx_buffer, compressed_size_bytes);
-
-  thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
+                                                          num_symbols);
+  gidx_buffer.SetDevice(device);
+  // Don't call fill unnecessarily
+  if (gidx_buffer.Size() == 0) {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+  } else {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
+  }
 }

 // Compress a CSR page into ELLPACK.
 void EllpackPageImpl::CreateHistIndices(int device,
-                                        const SparsePage& row_batch,
-                                        const RowStateOnDevice& device_row_state) {
-  // Has any been allocated for me in this batch?
-  if (!device_row_state.rows_to_process_from_batch) return;
-
-  unsigned int null_gidx_value = matrix.info.n_bins;
-  size_t row_stride = matrix.info.row_stride;
+                                        const SparsePage& row_batch) {
+  if (row_batch.Size() == 0) return;
+  unsigned int null_gidx_value = NumSymbols() - 1;

  const auto& offset_vec = row_batch.offset.ConstHostVector();

  // bin and compress entries in batches of rows
-  size_t gpu_batch_nrows = std::min(
-      dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
-      static_cast<size_t>(device_row_state.rows_to_process_from_batch));
+  size_t gpu_batch_nrows =
+      std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
+               static_cast<size_t>(row_batch.Size()));
  const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();

-  size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
-                                           gpu_batch_nrows);
+  size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);

  for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
    size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
-    size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
-    if (batch_row_end > device_row_state.rows_to_process_from_batch) {
-      batch_row_end = device_row_state.rows_to_process_from_batch;
-    }
+    size_t batch_row_end =
+        std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size());
    size_t batch_nrows = batch_row_end - batch_row_begin;

-    const auto ent_cnt_begin =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
-    const auto ent_cnt_end =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+    const auto ent_cnt_begin = offset_vec[batch_row_begin];
+    const auto ent_cnt_end = offset_vec[batch_row_end];

    /*! \brief row offset in SparsePage (the input data). */
    dh::device_vector<size_t> row_ptrs(batch_nrows + 1);
-    thrust::copy(
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
-        row_ptrs.begin());
+    thrust::copy(offset_vec.data() + batch_row_begin,
+                 offset_vec.data() + batch_row_end + 1, row_ptrs.begin());

    // number of entries in this batch.
    size_t n_entries = ent_cnt_end - ent_cnt_begin;
@@ -289,97 +259,50 @@ void EllpackPageImpl::CreateHistIndices(int device,
    // copy data entries to device.
    dh::safe_cuda(cudaMemcpy(entries_d.data().get(),
                             data_vec.data() + ent_cnt_begin,
-                             n_entries * sizeof(Entry),
-                             cudaMemcpyDefault));
+                             n_entries * sizeof(Entry), cudaMemcpyDefault));
    const dim3 block3(32, 8, 1);  // 256 threads
    const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
-                     common::DivRoundUp(row_stride, block3.y),
-                     1);
-    dh::LaunchKernel {grid3, block3} (
-        CompressBinEllpackKernel,
-        common::CompressedBufferWriter(matrix.info.NumSymbols()),
-        gidx_buffer.data(),
-        row_ptrs.data().get(),
-        entries_d.data().get(),
-        matrix.info.gidx_fvalue_map.data(),
-        matrix.info.feature_segments.data(),
-        device_row_state.total_rows_processed + batch_row_begin,
-        batch_nrows,
-        row_stride,
+                     common::DivRoundUp(row_stride, block3.y), 1);
+    auto device_accessor = GetDeviceAccessor(device);
+    dh::LaunchKernel {grid3, block3}(
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
+        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
+        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(),
+        row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
        null_gidx_value);
  }
 }

 // Return the number of rows contained in this page.
-size_t EllpackPageImpl::Size() const {
-  return matrix.n_rows;
-}
-
-// Clear the current page.
-void EllpackPageImpl::Clear() {
-  ba_.Clear();
-  gidx_buffer = {};
-  idx_buffer.clear();
-  sparse_page_.Clear();
-  matrix.base_rowid = 0;
-  matrix.n_rows = 0;
-  device_initialized_ = false;
-}
-
-// Push a CSR page to the current page.
-//
-// The CSR pages are accumulated in memory until they reach a certain size, then written out as
-// compressed ELLPACK.
-void EllpackPageImpl::Push(int device, const SparsePage& batch) {
-  sparse_page_.Push(batch);
-  matrix.n_rows += batch.Size();
-}
-
-// Compress the accumulated SparsePage.
-void EllpackPageImpl::CompressSparsePage(int device) {
-  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, matrix.n_rows);
-  monitor_.StopCuda("InitCompressedData");
-
-  monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
-  hist_builder_row_state.BeginBatch(sparse_page_);
-  CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
-  hist_builder_row_state.EndBatch();
-  monitor_.StopCuda("BinningCompression");
-
-  monitor_.StartCuda("CopyDeviceToHost");
-  idx_buffer.resize(gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
-  ba_.Clear();
-  gidx_buffer = {};
-  monitor_.StopCuda("CopyDeviceToHost");
-}
+size_t EllpackPageImpl::Size() const { return n_rows; }

 // Return the memory cost for storing the compressed features.
-size_t EllpackPageImpl::MemCostBytes() const {
-  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols());
+size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
+                                     const common::HistogramCuts& cuts) {
+  // Required buffer size for storing data matrix in EtoLLPack format.
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows,
+                                                          cuts.TotalBins() + 1);
  return compressed_size_bytes;
 }

-// Copy the compressed features to GPU.
-void EllpackPageImpl::InitDevice(int device, EllpackInfo info) {
-  if (device_initialized_) return;
+EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const {
+  gidx_buffer.SetDevice(device);
+  return EllpackDeviceAccessor(
+      device, cuts_, is_dense, row_stride, base_rowid, n_rows,
+      common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
+                                           NumSymbols()));
+}

-  monitor_.StartCuda("CopyPageToDevice");
-  dh::safe_cuda(cudaSetDevice(device));
-
-  gidx_buffer = {};
-  ba_.Allocate(device, &gidx_buffer, idx_buffer.size());
-  dh::CopyVectorToDeviceSpan(gidx_buffer, idx_buffer);
-
-  matrix.info = info;
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), info.n_bins + 1);
-
-  monitor_.StopCuda("CopyPageToDevice");
-
-  device_initialized_ = true;
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 const SparsePage& page, bool is_dense,
+                                 size_t row_stride)
+    : cuts_(std::move(cuts)),
+      is_dense(is_dense),
+      n_rows(page.Size()),
+      row_stride(row_stride) {
+  this->InitCompressedData(device);
+  this->CreateHistIndices(device, page);
 }
 }  // namespace xgboost
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -40,71 +40,53 @@ __forceinline__ __device__ int BinarySearchRow(
  return -1;
 }

-/** \brief Meta information about the ELLPACK matrix. */
-struct EllpackInfo {
+/** \brief Struct for accessing and manipulating an ellpack matrix on the
+ * device. Does not own underlying memory and may be trivially copied into
+ * kernels.*/
+struct EllpackDeviceAccessor {
  /*! \brief Whether or not if the matrix is dense. */
  bool is_dense;
  /*! \brief Row length for ELLPack, equal to number of features. */
  size_t row_stride;
-  /*! \brief Total number of bins, also used as the null index value, . */
-  size_t n_bins;
-  /*! \brief Minimum value for each feature. Size equals to number of features. */
-  common::Span<bst_float> min_fvalue;
-  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
-  common::Span<uint32_t> feature_segments;
-  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
-  common::Span<bst_float> gidx_fvalue_map;
-
-  EllpackInfo() = default;
-
-  /*!
-   * \brief Constructor.
-   *
-   * @param device The GPU device to use.
-   * @param is_dense Whether the matrix is dense.
-   * @param row_stride The number of features between starts of consecutive rows.
-   * @param hmat The histogram cuts of all the features.
-   * @param ba The BulkAllocator that owns the GPU memory.
-   */
-  explicit EllpackInfo(int device,
-                       bool is_dense,
-                       size_t row_stride,
-                       const common::HistogramCuts& hmat,
-                       dh::BulkAllocator* ba);
-
-  /*! \brief Return the total number of symbols (total number of bins plus 1 for not found). */
-  size_t NumSymbols() const {
-    return n_bins + 1;
-  }
-  size_t NumFeatures() const {
-    return min_fvalue.size();
-  }
-};
-
-/** \brief Struct for accessing and manipulating an ellpack matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
-struct EllpackMatrix {
-  EllpackInfo info;
  size_t base_rowid{};
  size_t n_rows{};
  common::CompressedIterator<uint32_t> gidx_iter;
+  /*! \brief Minimum value for each feature. Size equals to number of features. */
+  common::Span<const bst_float> min_fvalue;
+  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
+  common::Span<const uint32_t> feature_segments;
+  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
+  common::Span<const bst_float> gidx_fvalue_map;

+  EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
+                        bool is_dense, size_t row_stride, size_t base_rowid,
+                        size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter)
+      : is_dense(is_dense),
+        row_stride(row_stride),
+        base_rowid(base_rowid),
+        n_rows(n_rows) ,gidx_iter(gidx_iter){
+    cuts.cut_values_.SetDevice(device);
+    cuts.cut_ptrs_.SetDevice(device);
+    cuts.min_vals_.SetDevice(device);
+    gidx_fvalue_map = cuts.cut_values_.ConstDeviceSpan();
+    feature_segments = cuts.cut_ptrs_.ConstDeviceSpan();
+    min_fvalue = cuts.min_vals_.ConstDeviceSpan();
+  }
  // Get a matrix element, uses binary search for look up Return NaN if missing
  // Given a row index and a feature index, returns the corresponding cut value
  __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
    ridx -= base_rowid;
-    auto row_begin = info.row_stride * ridx;
-    auto row_end = row_begin + info.row_stride;
+    auto row_begin = row_stride * ridx;
+    auto row_end = row_begin + row_stride;
    auto gidx = -1;
-    if (info.is_dense) {
+    if (is_dense) {
      gidx = gidx_iter[row_begin + fidx];
    } else {
      gidx = BinarySearchRow(row_begin,
                             row_end,
                             gidx_iter,
-                             info.feature_segments[fidx],
-                             info.feature_segments[fidx + 1]);
+                             feature_segments[fidx],
+                             feature_segments[fidx + 1]);
    }
    return gidx;
  }
@@ -113,97 +95,27 @@ struct EllpackMatrix {
    if (gidx == -1) {
      return nan("");
    }
-    return info.gidx_fvalue_map[gidx];
+    return gidx_fvalue_map[gidx];
  }

  // Check if the row id is withing range of the current batch.
  __device__ bool IsInRange(size_t row_id) const {
    return row_id >= base_rowid && row_id < base_rowid + n_rows;
  }
+  /*! \brief Return the total number of symbols (total number of bins plus 1 for
+   * not found). */
+  size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
+
+  size_t NullValue() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
 };

-// Instances of this type are created while creating the histogram bins for the
-// entire dataset across multiple sparse page batches. This keeps track of the number
-// of rows to process from a batch and the position from which to process on each device.
-struct RowStateOnDevice {
-  // Number of rows assigned to this device
-  size_t total_rows_assigned_to_device;
-  // Number of rows processed thus far
-  size_t total_rows_processed;
-  // Number of rows to process from the current sparse page batch
-  size_t rows_to_process_from_batch;
-  // Offset from the current sparse page batch to begin processing
-  size_t row_offset_in_current_batch;
-
-  explicit RowStateOnDevice(size_t total_rows)
-      : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-        rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
-  }
-
-  explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
-      : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-        rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
-  }
-
-  // Advance the row state by the number of rows processed
-  void Advance() {
-    total_rows_processed += rows_to_process_from_batch;
-    CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
-    rows_to_process_from_batch = row_offset_in_current_batch = 0;
-  }
-};
-
-// An instance of this type is created which keeps track of total number of rows to process,
-// rows processed thus far, rows to process and the offset from the current sparse page batch
-// to begin processing on each device
-class DeviceHistogramBuilderState {
- public:
-  explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {}
-
-  const RowStateOnDevice& GetRowStateOnDevice() const {
-    return device_row_state_;
-  }
-
-  // This method is invoked at the beginning of each sparse page batch. This distributes
-  // the rows in the sparse page to the device.
-  // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
-  void BeginBatch(const SparsePage &batch) {
-    size_t rem_rows = batch.Size();
-    size_t row_offset_in_current_batch = 0;
-
-    // Do we have anymore left to process from this batch on this device?
-    if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
-      // There are still some rows that needs to be assigned to this device
-      device_row_state_.rows_to_process_from_batch =
-          std::min(
-              device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
-              rem_rows);
-    } else {
-      // All rows have been assigned to this device
-      device_row_state_.rows_to_process_from_batch = 0;
-    }
-
-    device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
-    row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
-    rem_rows -= device_row_state_.rows_to_process_from_batch;
-  }
-
-  // This method is invoked after completion of each sparse page batch
-  void EndBatch() {
-    device_row_state_.Advance();
-  }
-
- private:
-  RowStateOnDevice device_row_state_{0};
-};

 class EllpackPageImpl {
 public:
-  EllpackMatrix matrix;
-  /*! \brief global index of histogram, which is stored in ELLPack format. */
-  common::Span<common::CompressedByteT> gidx_buffer;
-  std::vector<common::CompressedByteT> idx_buffer;
-
  /*!
   * \brief Default constructor.
   *
@@ -218,7 +130,12 @@ class EllpackPageImpl {
   * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
   * and the given number of rows.
   */
-  explicit EllpackPageImpl(int device, EllpackInfo info, size_t n_rows);
+  EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
+                  size_t row_stride, size_t n_rows);
+
+  EllpackPageImpl(int device, common::HistogramCuts cuts,
+                  const SparsePage& page,
+                  bool is_dense,size_t row_stride);

  /*!
   * \brief Constructor from an existing DMatrix.
@@ -245,77 +162,53 @@ class EllpackPageImpl {
   */
  void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);

-  /*!
-   * \brief Initialize the EllpackInfo contained in the EllpackMatrix.
-   *
-   * This is used in the in-memory case. The current page owns the BulkAllocator, which in turn owns
-   * the GPU memory used by the EllpackInfo.
-   *
-   * @param device The GPU device to use.
-   * @param is_dense Whether the matrix is dense.
-   * @param row_stride The number of features between starts of consecutive rows.
-   * @param hmat The histogram cuts of all the features.
-   */
-  void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat);
-
-  /*!
-   * \brief Initialize the buffer to store compressed features.
-   *
-   * @param device The GPU device to use.
-   * @param num_rows The number of rows we are storing in the buffer.
-   */
-  void InitCompressedData(int device, size_t num_rows);
-
-  /*!
-   * \brief Compress a single page of CSR data into ELLPACK.
-   *
-   * @param device The GPU device to use.
-   * @param row_batch The CSR page.
-   * @param device_row_state On-device data for maintaining state.
-   */
-  void CreateHistIndices(int device,
-                         const SparsePage& row_batch,
-                         const RowStateOnDevice& device_row_state);

  /*! \return Number of instances in the page. */
  size_t Size() const;

  /*! \brief Set the base row id for this page. */
-  inline void SetBaseRowId(size_t row_id) {
-    matrix.base_rowid = row_id;
+  void SetBaseRowId(size_t row_id) {
+    base_rowid = row_id;
  }

-  /*! \brief clear the page. */
-  void Clear();
-
-  /*!
-   * \brief Push a sparse page.
-   * \param batch The row page.
-   */
-  void Push(int device, const SparsePage& batch);
-
  /*! \return Estimation of memory cost of this page. */
-  size_t MemCostBytes() const;
+  static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;

-  /*!
-   * \brief Copy the ELLPACK matrix to GPU.
-   *
-   * @param device The GPU device to use.
-   * @param info The EllpackInfo for the matrix.
-   */
-  void InitDevice(int device, EllpackInfo info);

-  /*! \brief Compress the accumulated SparsePage into ELLPACK format.
-   *
-   * @param device The GPU device to use.
-   */
-  void CompressSparsePage(int device);
+  /*! \brief Return the total number of symbols (total number of bins plus 1 for
+   * not found). */
+  size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
+
+  EllpackDeviceAccessor GetDeviceAccessor(int device) const;

 private:
+  /*!
+   * \brief Compress a single page of CSR data into ELLPACK.
+   *
+   * @param device The GPU device to use.
+   * @param row_batch The CSR page.
+   */
+  void CreateHistIndices(int device,
+                         const SparsePage& row_batch
+                         );
+  /*!
+   * \brief Initialize the buffer to store compressed features.
+   */
+  void InitCompressedData(int device);
+
+
+public:
+  /*! \brief Whether or not if the matrix is dense. */
+  bool is_dense;
+  /*! \brief Row length for ELLPack. */
+  size_t row_stride;
+  size_t base_rowid{0};
+  size_t n_rows{};
+  /*! \brief global index of histogram, which is stored in ELLPack format. */
+  HostDeviceVector<common::CompressedByteT> gidx_buffer;
+  common::HistogramCuts cuts_;
+private:
  common::Monitor monitor_;
-  dh::BulkAllocator ba_;
-  bool device_initialized_{false};
-  SparsePage sparse_page_{};
 };

 }  // namespace xgboost
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -17,26 +17,35 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
 public:
  bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
    auto* impl = page->Impl();
-    impl->Clear();
-    if (!fi->Read(&impl->matrix.n_rows)) return false;
-    return fi->Read(&impl->idx_buffer);
+    fi->Read(&impl->cuts_.cut_values_.HostVector());
+    fi->Read(&impl->cuts_.cut_ptrs_.HostVector());
+    fi->Read(&impl->cuts_.min_vals_.HostVector());
+    fi->Read(&impl->n_rows);
+    fi->Read(&impl->is_dense);
+    fi->Read(&impl->row_stride);
+    if (!fi->Read(&impl->gidx_buffer.HostVector())) {
+      return false;
+    }
+    return true;
  }

  bool Read(EllpackPage* page,
            dmlc::SeekStream* fi,
            const std::vector<bst_uint>& sorted_index_set) override {
-    auto* impl = page->Impl();
-    impl->Clear();
-    if (!fi->Read(&impl->matrix.n_rows))  return false;
-    return fi->Read(&page->Impl()->idx_buffer);
+    LOG(FATAL) << "Not implemented";
+    return false;
  }

  void Write(const EllpackPage& page, dmlc::Stream* fo) override {
    auto* impl = page.Impl();
-    fo->Write(impl->matrix.n_rows);
-    auto buffer = impl->idx_buffer;
-    CHECK(!buffer.empty());
-    fo->Write(buffer);
+    fo->Write(impl->cuts_.cut_values_.ConstHostVector());
+    fo->Write(impl->cuts_.cut_ptrs_.ConstHostVector());
+    fo->Write(impl->cuts_.min_vals_.ConstHostVector());
+    fo->Write(impl->n_rows);
+    fo->Write(impl->is_dense);
+    fo->Write(impl->row_stride);
+    CHECK(!impl->gidx_buffer.ConstHostVector().empty());
+    fo->Write(impl->gidx_buffer.HostVector());
  }
 };

--- a/src/data/ellpack_page_source.cc
+++ b/src/data/ellpack_page_source.cc
@@ -2,45 +2,23 @@
 * Copyright 2019 XGBoost contributors
 */
 #ifndef XGBOOST_USE_CUDA
+#include <dmlc/base.h>
+#if DMLC_ENABLE_STD_THREAD

-#include <xgboost/data.h>
 #include "ellpack_page_source.h"
+#include <xgboost/data.h>
 namespace xgboost {
 namespace data {

 EllpackPageSource::EllpackPageSource(DMatrix* dmat,
                                     const std::string& cache_info,
                                     const BatchParam& param) noexcept(false) {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-}
-
-void EllpackPageSource::BeforeFirst() {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-}
-
-bool EllpackPageSource::Next() {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-  return false;
-}
-
-EllpackPage& EllpackPageSource::Value() {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-  EllpackPage* page { nullptr };
-  return *page;
-}
-
-const EllpackPage& EllpackPageSource::Value() const {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-  EllpackPage* page { nullptr };
-  return *page;
+  LOG(FATAL)
+      << "Internal Error: "
+         "XGBoost is not compiled with CUDA but EllpackPageSource is required";
 }

 }  // namespace data
 }  // namespace xgboost
-
+#endif  // DMLC_ENABLE_STD_THREAD
 #endif  // XGBOOST_USE_CUDA
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -3,73 +3,16 @@
 */
 #include <memory>
 #include <utility>
-#include <vector>

 #include "../common/hist_util.h"

+#include "ellpack_page.cuh"
 #include "ellpack_page_source.h"
 #include "sparse_page_source.h"
-#include "ellpack_page.cuh"

 namespace xgboost {
 namespace data {

-class EllpackPageSourceImpl : public DataSource<EllpackPage> {
- public:
-  /*!
-   * \brief Create source from cache files the cache_prefix.
-   * \param cache_prefix The prefix of cache we want to solve.
-   */
-  explicit EllpackPageSourceImpl(DMatrix* dmat,
-                                 const std::string& cache_info,
-                                 const BatchParam& param) noexcept(false);
-
-  /*! \brief destructor */
-  ~EllpackPageSourceImpl() override = default;
-
-  void BeforeFirst() override;
-  bool Next() override;
-  EllpackPage& Value();
-  const EllpackPage& Value() const override;
-
- private:
-  /*! \brief Write Ellpack pages after accumulating them in memory. */
-  void WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const;
-
-  /*! \brief The page type string for ELLPACK. */
-  const std::string kPageType_{".ellpack.page"};
-
-  int device_{-1};
-  size_t page_size_{DMatrix::kPageSize};
-  common::Monitor monitor_;
-  dh::BulkAllocator ba_;
-  /*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
-  EllpackInfo ellpack_info_;
-  std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> source_;
-  std::string cache_info_;
-};
-
-EllpackPageSource::EllpackPageSource(DMatrix* dmat,
-                                     const std::string& cache_info,
-                                     const BatchParam& param) noexcept(false)
-    : impl_{new EllpackPageSourceImpl(dmat, cache_info, param)} {}
-
-void EllpackPageSource::BeforeFirst() {
-  impl_->BeforeFirst();
-}
-
-bool EllpackPageSource::Next() {
-  return impl_->Next();
-}
-
-EllpackPage& EllpackPageSource::Value() {
-  return impl_->Value();
-}
-
-const EllpackPage& EllpackPageSource::Value() const {
-  return impl_->Value();
-}
-
 size_t GetRowStride(DMatrix* dmat) {
  if (dmat->IsDense()) return dmat->Info().num_col_;

@@ -86,17 +29,19 @@ size_t GetRowStride(DMatrix* dmat) {

 // Build the quantile sketch across the whole input data, then use the histogram cuts to compress
 // each CSR page, and write the accumulated ELLPACK pages to disk.
-EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
-                                             const std::string& cache_info,
-                                             const BatchParam& param) noexcept(false)
-    : device_(param.gpu_id), cache_info_(cache_info) {
-
+EllpackPageSource::EllpackPageSource(DMatrix* dmat,
+                                     const std::string& cache_info,
+                                     const BatchParam& param) noexcept(false) {
+  cache_info_ = ParseCacheInfo(cache_info, kPageType_);
+  for (auto file : cache_info_.name_shards) {
+    CheckCacheFileExists(file);
+  }
  if (param.gpu_page_size > 0) {
    page_size_ = param.gpu_page_size;
  }

  monitor_.Init("ellpack_page_source");
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(param.gpu_id));

  monitor_.StartCuda("Quantiles");
  size_t row_stride = GetRowStride(dmat);
@@ -104,75 +49,52 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
                                   param.gpu_batch_nrows);
  monitor_.StopCuda("Quantiles");

-  monitor_.StartCuda("CreateEllpackInfo");
-  ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, cuts, &ba_);
-  monitor_.StopCuda("CreateEllpackInfo");
-
  monitor_.StartCuda("WriteEllpackPages");
-  WriteEllpackPages(dmat, cache_info);
+  WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
  monitor_.StopCuda("WriteEllpackPages");

-  source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
-      ParseCacheInfo(cache_info_, kPageType_)));
-}
-
-void EllpackPageSourceImpl::BeforeFirst() {
-  source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
-      ParseCacheInfo(cache_info_, kPageType_)));
-  source_->BeforeFirst();
-}
-
-bool EllpackPageSourceImpl::Next() {
-  return source_->Next();
-}
-
-EllpackPage& EllpackPageSourceImpl::Value() {
-  EllpackPage& page = source_->Value();
-  page.Impl()->InitDevice(device_, ellpack_info_);
-  return page;
-}
-
-const EllpackPage& EllpackPageSourceImpl::Value() const {
-  EllpackPage& page = source_->Value();
-  page.Impl()->InitDevice(device_, ellpack_info_);
-  return page;
+  external_prefetcher_.reset(
+      new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
 }

 // Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
-void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const {
+void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
+                                          const common::HistogramCuts& cuts,
+                                          const std::string& cache_info,
+                                          size_t row_stride) const {
  auto cinfo = ParseCacheInfo(cache_info, kPageType_);
  const size_t extra_buffer_capacity = 6;
-  SparsePageWriter<EllpackPage> writer(
-      cinfo.name_shards, cinfo.format_shards, extra_buffer_capacity);
+  SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
+                                       extra_buffer_capacity);
  std::shared_ptr<EllpackPage> page;
+  SparsePage temp_host_page;
  writer.Alloc(&page);
  auto* impl = page->Impl();
-  impl->matrix.info = ellpack_info_;
-  impl->Clear();

-  const MetaInfo& info = dmat->Info();
  size_t bytes_write = 0;
  double tstart = dmlc::GetTime();
  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    impl->Push(device_, batch);
+    temp_host_page.Push(batch);

-    size_t mem_cost_bytes = impl->MemCostBytes();
+    size_t mem_cost_bytes =
+        EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
    if (mem_cost_bytes >= page_size_) {
      bytes_write += mem_cost_bytes;
-      impl->CompressSparsePage(device_);
+      *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
+                              row_stride);
      writer.PushWrite(std::move(page));
      writer.Alloc(&page);
      impl = page->Impl();
-      impl->matrix.info = ellpack_info_;
-      impl->Clear();
+      temp_host_page.Clear();
      double tdiff = dmlc::GetTime() - tstart;
      LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
                << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
                << (bytes_write >> 20UL) << " written";
    }
  }
-  if (impl->Size() != 0) {
-    impl->CompressSparsePage(device_);
+  if (temp_host_page.Size() != 0) {
+    *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
+                            row_stride);
    writer.PushWrite(std::move(page));
  }
 }
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -10,19 +10,17 @@
 #include <string>

 #include "../common/timer.h"
+#include "../common/hist_util.h"
+#include "sparse_page_source.h"

 namespace xgboost {
 namespace data {

-class EllpackPageSourceImpl;
-
 /*!
 * \brief External memory data source for ELLPACK format.
 *
- * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
- * including CUDA-specific implementation details in the header.
 */
-class EllpackPageSource : public DataSource<EllpackPage> {
+class EllpackPageSource {
 public:
  /*!
   * \brief Create source from cache files the cache_prefix.
@@ -32,19 +30,33 @@ class EllpackPageSource : public DataSource<EllpackPage> {
                             const std::string& cache_info,
                             const BatchParam& param) noexcept(false);

-  /*! \brief destructor */
-  ~EllpackPageSource() override = default;
+  BatchSet<EllpackPage> GetBatchSet() {
+    auto begin_iter = BatchIterator<EllpackPage>(
+        new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
+                                    EllpackPage>(external_prefetcher_.get()));
+    return BatchSet<EllpackPage>(begin_iter);
+  }

-  void BeforeFirst() override;
-  bool Next() override;
-  EllpackPage& Value();
-  const EllpackPage& Value() const override;
-
-  const EllpackPageSourceImpl* Impl() const { return impl_.get(); }
-  EllpackPageSourceImpl* Impl() { return impl_.get(); }
+  ~EllpackPageSource() {
+    external_prefetcher_.reset();
+    for (auto file : cache_info_.name_shards) {
+      TryDeleteCacheFile(file);
+    }
+  }

 private:
-  std::shared_ptr<EllpackPageSourceImpl> impl_;
+  void WriteEllpackPages(int device, DMatrix* dmat,
+                         const common::HistogramCuts& cuts,
+                         const std::string& cache_info,
+                         size_t row_stride) const;
+
+  /*! \brief The page type string for ELLPACK. */
+  const std::string kPageType_{".ellpack.page"};
+
+  size_t page_size_{DMatrix::kPageSize};
+  common::Monitor monitor_;
+  std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
+  CacheInfo cache_info_;
 };

 }  // namespace data
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -51,11 +51,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
    ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
    batch_param_ = param;
  }
-  ellpack_source_->BeforeFirst();
-  ellpack_source_->Next();
-  auto begin_iter = BatchIterator<EllpackPage>(
-      new SparseBatchIteratorImpl<EllpackPageSource, EllpackPage>(ellpack_source_.get()));
-  return BatchSet<EllpackPage>(begin_iter);
+  return ellpack_source_->GetBatchSet();
 }

 }  // namespace data