Partial rewrite EllpackPage (#5352)
parent 7a99f8f27f
commit 3ad4333b0e

@@ -181,13 +181,13 @@ class CompressedIterator {
   typedef value_type reference;  // NOLINT
 
  private:
-  CompressedByteT *buffer_;
+  const CompressedByteT *buffer_;
   size_t symbol_bits_;
   size_t offset_;
 
  public:
   CompressedIterator() : buffer_(nullptr), symbol_bits_(0), offset_(0) {}
-  CompressedIterator(CompressedByteT *buffer, size_t num_symbols)
+  CompressedIterator(const CompressedByteT *buffer, size_t num_symbols)
       : buffer_(buffer), offset_(0) {
     symbol_bits_ = detail::SymbolBits(num_symbols);
   }
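The change above only adds const: CompressedIterator reads symbols out of a bit-packed buffer and never writes to it, so it can now be constructed over read-only memory (for example the const device pointer of the HostDeviceVector introduced later in this patch). A minimal stand-alone sketch of the same idea in plain C++ — not the actual xgboost CompressedBufferWriter/CompressedIterator bit layout:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Pack each symbol into `bits` bits; the reader only needs a const pointer.
std::vector<uint8_t> Pack(const std::vector<uint32_t>& symbols, int bits) {
  std::vector<uint8_t> buf((symbols.size() * bits + 7) / 8, 0);
  for (size_t i = 0; i < symbols.size(); ++i) {
    for (int b = 0; b < bits; ++b) {
      size_t bit = i * bits + b;
      if ((symbols[i] >> b) & 1u) buf[bit / 8] |= 1u << (bit % 8);
    }
  }
  return buf;
}

// Read-only access through a const buffer, mirroring the const iterator above.
uint32_t Unpack(const uint8_t* buf, int bits, size_t i) {
  uint32_t v = 0;
  for (int b = 0; b < bits; ++b) {
    size_t bit = i * bits + b;
    v |= static_cast<uint32_t>((buf[bit / 8] >> (bit % 8)) & 1u) << b;
  }
  return v;
}

int main() {
  std::vector<uint32_t> syms{3, 0, 7, 5};
  auto buf = Pack(syms, 3);  // 3 bits per symbol, since every symbol < 8
  for (size_t i = 0; i < syms.size(); ++i) std::cout << Unpack(buf.data(), 3, i) << " ";
  std::cout << "\n";  // prints: 3 0 7 5
}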
@@ -31,7 +31,7 @@ namespace common {
 
 HistogramCuts::HistogramCuts() {
   monitor_.Init(__FUNCTION__);
-  cut_ptrs_.emplace_back(0);
+  cut_ptrs_.HostVector().emplace_back(0);
 }
 
 // Dispatch to specific builder.
@@ -52,7 +52,7 @@ void HistogramCuts::Build(DMatrix* dmat, uint32_t const max_num_bins) {
     DenseCuts cuts(this);
     cuts.Build(dmat, max_num_bins);
   }
-  LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.back();
+  LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.HostVector().back();
 }
 
 bool CutsBuilder::UseGroup(DMatrix* dmat) {
@@ -75,7 +75,10 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
 
   // Data groups, used in ranking.
   std::vector<bst_uint> const& group_ptr = info.group_ptr_;
-  p_cuts_->min_vals_.resize(end_col - beg_col, 0);
+  auto &local_min_vals = p_cuts_->min_vals_.HostVector();
+  auto &local_cuts = p_cuts_->cut_values_.HostVector();
+  auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector();
+  local_min_vals.resize(end_col - beg_col, 0);
 
   for (uint32_t col_id = beg_col; col_id < page.Size() && col_id < end_col; ++col_id) {
     // Using a local variable makes things easier, but at the cost of memory trashing.
@@ -85,7 +88,7 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
                              max_num_bins);
     if (n_bins == 0) {
       // cut_ptrs_ is initialized with a zero, so there's always an element at the back
-      p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_ptrs_.back());
+      local_ptrs.emplace_back(local_ptrs.back());
       continue;
     }
 
@@ -112,17 +115,17 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
     // Can be use data[1] as the min values so that we don't need to
     // store another array?
     float mval = summary.data[0].value;
-    p_cuts_->min_vals_[col_id - beg_col] = mval - (fabs(mval) + 1e-5);
+    local_min_vals[col_id - beg_col] = mval - (fabs(mval) + 1e-5);
 
     this->AddCutPoint(summary, max_num_bins);
 
     bst_float cpt = (summary.size > 0) ?
                     summary.data[summary.size - 1].value :
-                    p_cuts_->min_vals_[col_id - beg_col];
+                    local_min_vals[col_id - beg_col];
     cpt += fabs(cpt) + 1e-5;
-    p_cuts_->cut_values_.emplace_back(cpt);
+    local_cuts.emplace_back(cpt);
 
-    p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_values_.size());
+    local_ptrs.emplace_back(local_cuts.size());
   }
 }
 
@@ -196,33 +199,40 @@ void SparseCuts::Concat(
     std::vector<std::unique_ptr<SparseCuts>> const& cuts, uint32_t n_cols) {
   monitor_.Start(__FUNCTION__);
   uint32_t nthreads = omp_get_max_threads();
-  p_cuts_->min_vals_.resize(n_cols, std::numeric_limits<float>::max());
+  auto &local_min_vals = p_cuts_->min_vals_.HostVector();
+  auto &local_cuts = p_cuts_->cut_values_.HostVector();
+  auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector();
+  local_min_vals.resize(n_cols, std::numeric_limits<float>::max());
   size_t min_vals_tail = 0;
 
   for (uint32_t t = 0; t < nthreads; ++t) {
+    auto& thread_min_vals = cuts[t]->p_cuts_->min_vals_.HostVector();
+    auto& thread_cuts = cuts[t]->p_cuts_->cut_values_.HostVector();
+    auto& thread_ptrs = cuts[t]->p_cuts_->cut_ptrs_.HostVector();
+
     // concat csc pointers.
-    size_t const old_ptr_size = p_cuts_->cut_ptrs_.size();
-    p_cuts_->cut_ptrs_.resize(
-        cuts[t]->p_cuts_->cut_ptrs_.size() + p_cuts_->cut_ptrs_.size() - 1);
-    size_t const new_icp_size = p_cuts_->cut_ptrs_.size();
-    auto tail = p_cuts_->cut_ptrs_[old_ptr_size-1];
+    size_t const old_ptr_size = local_ptrs.size();
+    local_ptrs.resize(
+        thread_ptrs.size() + local_ptrs.size() - 1);
+    size_t const new_icp_size = local_ptrs.size();
+    auto tail = local_ptrs[old_ptr_size-1];
     for (size_t j = old_ptr_size; j < new_icp_size; ++j) {
-      p_cuts_->cut_ptrs_[j] = tail + cuts[t]->p_cuts_->cut_ptrs_[j-old_ptr_size+1];
+      local_ptrs[j] = tail + thread_ptrs[j-old_ptr_size+1];
     }
     // concat csc values
-    size_t const old_iv_size = p_cuts_->cut_values_.size();
-    p_cuts_->cut_values_.resize(
-        cuts[t]->p_cuts_->cut_values_.size() + p_cuts_->cut_values_.size());
-    size_t const new_iv_size = p_cuts_->cut_values_.size();
+    size_t const old_iv_size = local_cuts.size();
+    local_cuts.resize(
+        thread_cuts.size() + local_cuts.size());
+    size_t const new_iv_size = local_cuts.size();
     for (size_t j = old_iv_size; j < new_iv_size; ++j) {
-      p_cuts_->cut_values_[j] = cuts[t]->p_cuts_->cut_values_[j-old_iv_size];
+      local_cuts[j] = thread_cuts[j-old_iv_size];
     }
     // merge min values
-    for (size_t j = 0; j < cuts[t]->p_cuts_->min_vals_.size(); ++j) {
-      p_cuts_->min_vals_.at(min_vals_tail + j) =
-          std::min(p_cuts_->min_vals_.at(min_vals_tail + j), cuts.at(t)->p_cuts_->min_vals_.at(j));
+    for (size_t j = 0; j < thread_min_vals.size(); ++j) {
+      local_min_vals.at(min_vals_tail + j) =
+          std::min(local_min_vals.at(min_vals_tail + j), thread_min_vals.at(j));
     }
-    min_vals_tail += cuts[t]->p_cuts_->min_vals_.size();
+    min_vals_tail += thread_min_vals.size();
   }
   monitor_.Stop(__FUNCTION__);
 }
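The loop above splices per-thread CSC structures into one: cut values are appended verbatim, while each thread's cut pointer array (which always starts with 0) has its leading zero dropped and the remaining offsets shifted by the current tail. A stand-alone sketch of that pointer merge with plain vectors:

#include <cassert>
#include <cstdint>
#include <vector>

// Append one CSC-style offset array onto another. Both start with 0; src's
// leading 0 is skipped and the rest is shifted by dst's current tail.
void ConcatPtrs(std::vector<uint32_t>* dst, const std::vector<uint32_t>& src) {
  uint32_t tail = dst->back();
  for (size_t j = 1; j < src.size(); ++j) {
    dst->push_back(tail + src[j]);
  }
}

int main() {
  std::vector<uint32_t> ptrs{0, 2, 5};         // two columns with 2 and 3 cuts
  std::vector<uint32_t> thread_ptrs{0, 4, 6};  // two more columns: 4 and 2 cuts
  ConcatPtrs(&ptrs, thread_ptrs);
  assert((ptrs == std::vector<uint32_t>{0, 2, 5, 9, 11}));
}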
@@ -323,27 +333,27 @@ void DenseCuts::Init
   // TODO(chenqin): rabit failure recovery assumes no boostrap onetime call after loadcheckpoint
   // we need to move this allreduce before loadcheckpoint call in future
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
-  p_cuts_->min_vals_.resize(sketchs.size());
+  p_cuts_->min_vals_.HostVector().resize(sketchs.size());
 
   for (size_t fid = 0; fid < summary_array.size(); ++fid) {
     WQSketch::SummaryContainer a;
     a.Reserve(max_num_bins + 1);
     a.SetPrune(summary_array[fid], max_num_bins + 1);
     const bst_float mval = a.data[0].value;
-    p_cuts_->min_vals_[fid] = mval - (fabs(mval) + 1e-5);
+    p_cuts_->min_vals_.HostVector()[fid] = mval - (fabs(mval) + 1e-5);
     AddCutPoint(a, max_num_bins);
     // push a value that is greater than anything
     const bst_float cpt
-      = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_[fid];
+      = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_.HostVector()[fid];
     // this must be bigger than last value in a scale
     const bst_float last = cpt + (fabs(cpt) + 1e-5);
-    p_cuts_->cut_values_.push_back(last);
+    p_cuts_->cut_values_.HostVector().push_back(last);
 
     // Ensure that every feature gets at least one quantile point
-    CHECK_LE(p_cuts_->cut_values_.size(), std::numeric_limits<uint32_t>::max());
-    auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.size());
-    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back());
-    p_cuts_->cut_ptrs_.push_back(cut_size);
+    CHECK_LE(p_cuts_->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
+    auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.HostVector().size());
+    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.HostVector().back());
+    p_cuts_->cut_ptrs_.HostVector().push_back(cut_size);
   }
   monitor_.Stop(__func__);
 }
@@ -44,17 +44,35 @@ class HistogramCuts {
   using BinIdx = uint32_t;
   common::Monitor monitor_;
 
-  std::vector<bst_float> cut_values_;
-  std::vector<uint32_t> cut_ptrs_;
-  std::vector<float> min_vals_;  // storing minimum value in a sketch set.
 
  public:
+  HostDeviceVector<bst_float> cut_values_;
+  HostDeviceVector<uint32_t> cut_ptrs_;
+  HostDeviceVector<float> min_vals_;  // storing minimum value in a sketch set.
+
   HistogramCuts();
-  HistogramCuts(HistogramCuts const& that) = delete;
+  HistogramCuts(HistogramCuts const& that) {
+    cut_values_.Resize(that.cut_values_.Size());
+    cut_ptrs_.Resize(that.cut_ptrs_.Size());
+    min_vals_.Resize(that.min_vals_.Size());
+    cut_values_.Copy(that.cut_values_);
+    cut_ptrs_.Copy(that.cut_ptrs_);
+    min_vals_.Copy(that.min_vals_);
+  }
+
   HistogramCuts(HistogramCuts&& that) noexcept(true) {
     *this = std::forward<HistogramCuts&&>(that);
   }
-  HistogramCuts& operator=(HistogramCuts const& that) = delete;
+  HistogramCuts& operator=(HistogramCuts const& that) {
+    cut_values_.Resize(that.cut_values_.Size());
+    cut_ptrs_.Resize(that.cut_ptrs_.Size());
+    min_vals_.Resize(that.min_vals_.Size());
+    cut_values_.Copy(that.cut_values_);
+    cut_ptrs_.Copy(that.cut_ptrs_);
+    min_vals_.Copy(that.min_vals_);
+    return *this;
+  }
+
   HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
     monitor_ = std::move(that.monitor_);
     cut_ptrs_ = std::move(that.cut_ptrs_);
@@ -67,28 +85,30 @@ class HistogramCuts {
   void Build(DMatrix* dmat, uint32_t const max_num_bins);
   /* \brief How many bins a feature has. */
   uint32_t FeatureBins(uint32_t feature) const {
-    return cut_ptrs_.at(feature+1) - cut_ptrs_[feature];
+    return cut_ptrs_.ConstHostVector().at(feature + 1) -
+           cut_ptrs_.ConstHostVector()[feature];
   }
 
   // Getters. Cuts should be of no use after building histogram indices, but currently
   // it's deeply linked with quantile_hist, gpu sketcher and gpu_hist. So we preserve
   // these for now.
-  std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_; }
-  std::vector<float> const& Values() const { return cut_values_; }
-  std::vector<float> const& MinValues() const { return min_vals_; }
+  std::vector<uint32_t> const& Ptrs() const { return cut_ptrs_.ConstHostVector(); }
+  std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
+  std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
 
-  size_t TotalBins() const { return cut_ptrs_.back(); }
+  size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
 
   // Return the index of a cut point that is strictly greater than the input
   // value, or the last available index if none exists
   BinIdx SearchBin(float value, uint32_t column_id) const {
-    auto beg = cut_ptrs_.at(column_id);
-    auto end = cut_ptrs_.at(column_id + 1);
-    auto it = std::upper_bound(cut_values_.cbegin() + beg, cut_values_.cbegin() + end, value);
-    if (it == cut_values_.cend()) {
-      it = cut_values_.cend() - 1;
+    auto beg = cut_ptrs_.ConstHostVector().at(column_id);
+    auto end = cut_ptrs_.ConstHostVector().at(column_id + 1);
+    const auto &values = cut_values_.ConstHostVector();
+    auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
+    if (it == values.cend()) {
+      it = values.cend() - 1;
     }
-    BinIdx idx = it - cut_values_.cbegin();
+    BinIdx idx = it - values.cbegin();
     return idx;
   }
 
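FeatureBins, the getters and SearchBin above now go through ConstHostVector(), but the binning logic itself is unchanged: cut_ptrs_ partitions cut_values_ per feature and the bin is the first cut strictly greater than the input value. A self-contained host-side sketch of that lookup, with plain std::vector standing in for the HostDeviceVector members:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// cut_ptrs partitions cut_values per feature; returns the index of the first
// cut strictly greater than `value`, clamped to the last available cut.
uint32_t SearchBin(const std::vector<uint32_t>& cut_ptrs,
                   const std::vector<float>& cut_values,
                   float value, uint32_t column_id) {
  auto beg = cut_ptrs.at(column_id);
  auto end = cut_ptrs.at(column_id + 1);
  auto it = std::upper_bound(cut_values.cbegin() + beg,
                             cut_values.cbegin() + end, value);
  if (it == cut_values.cend()) {
    it = cut_values.cend() - 1;
  }
  return static_cast<uint32_t>(it - cut_values.cbegin());
}

int main() {
  // One feature with cuts {0.5, 1.5, 2.5}, so cut_ptrs = {0, 3}.
  std::vector<uint32_t> ptrs{0, 3};
  std::vector<float> vals{0.5f, 1.5f, 2.5f};
  assert(SearchBin(ptrs, vals, 1.0f, 0) == 1);   // falls between 0.5 and 1.5
  assert(SearchBin(ptrs, vals, 99.0f, 0) == 2);  // clamped to the last cut
}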
@@ -133,8 +153,8 @@ class CutsBuilder {
     size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
     for (size_t i = 1; i < required_cuts; ++i) {
       bst_float cpt = summary.data[i].value;
-      if (i == 1 || cpt > p_cuts_->cut_values_.back()) {
-        p_cuts_->cut_values_.push_back(cpt);
+      if (i == 1 || cpt > p_cuts_->cut_values_.ConstHostVector().back()) {
+        p_cuts_->cut_values_.HostVector().push_back(cpt);
       }
     }
   }
@@ -371,6 +371,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
+template class HostDeviceVector<uint8_t>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
 template class HostDeviceVector<uint32_t>;  // bst_feature_t
@@ -13,11 +13,24 @@ class EllpackPageImpl {};
 EllpackPage::EllpackPage() = default;
 
 EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
 }
 
 EllpackPage::~EllpackPage() {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+void EllpackPage::SetBaseRowId(size_t row_id) {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+size_t EllpackPage::Size() const {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+  return 0;
 }
 
 }  // namespace xgboost
@@ -4,9 +4,9 @@
 
 #include <xgboost/data.h>
 
-#include "./ellpack_page.cuh"
 #include "../common/hist_util.h"
 #include "../common/random.h"
+#include "./ellpack_page.cuh"
 
 namespace xgboost {
 
@@ -17,13 +17,9 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)
 
 EllpackPage::~EllpackPage() = default;
 
-size_t EllpackPage::Size() const {
-  return impl_->Size();
-}
+size_t EllpackPage::Size() const { return impl_->Size(); }
 
-void EllpackPage::SetBaseRowId(size_t row_id) {
-  impl_->SetBaseRowId(row_id);
-}
+void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); }
 
 // Bin each input data entry, store the bin indices in compressed form.
 __global__ void CompressBinEllpackKernel(
@@ -65,16 +61,18 @@ __global__ void CompressBinEllpackKernel(
 }
 
 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) {
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 bool is_dense, size_t row_stride,
+                                 size_t n_rows)
+    : is_dense(is_dense),
+      cuts_(std::move(cuts)),
+      row_stride(row_stride),
+      n_rows(n_rows) {
   monitor_.Init("ellpack_page");
   dh::safe_cuda(cudaSetDevice(device));
 
-  matrix.info = info;
-  matrix.base_rowid = 0;
-  matrix.n_rows = n_rows;
-
   monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, n_rows);
+  InitCompressedData(device);
   monitor_.StopCuda("InitCompressedData");
 }
 
@@ -93,33 +91,27 @@ size_t GetRowStride(DMatrix* dmat) {
 }
 
 // Construct an ELLPACK matrix in memory.
-EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
+EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
+    : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
   dh::safe_cuda(cudaSetDevice(param.gpu_id));
 
-  matrix.n_rows = dmat->Info().num_row_;
+  n_rows = dmat->Info().num_row_;
 
   monitor_.StartCuda("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
-  size_t row_stride = GetRowStride(dmat);
-  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
+  row_stride = GetRowStride(dmat);
+  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
                                    param.gpu_batch_nrows);
   monitor_.StopCuda("Quantiles");
 
-  monitor_.StartCuda("InitEllpackInfo");
-  InitInfo(param.gpu_id, dmat->IsDense(), row_stride, cuts);
-  monitor_.StopCuda("InitEllpackInfo");
-
   monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(param.gpu_id, dmat->Info().num_row_);
+  InitCompressedData(param.gpu_id);
   monitor_.StopCuda("InitCompressedData");
 
   monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(dmat->Info().num_row_);
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    hist_builder_row_state.BeginBatch(batch);
-    CreateHistIndices(param.gpu_id, batch, hist_builder_row_state.GetRowStateOnDevice());
-    hist_builder_row_state.EndBatch();
+    CreateHistIndices(param.gpu_id, batch);
   }
   monitor_.StopCuda("BinningCompression");
 }
@@ -133,23 +125,26 @@ struct CopyPage {
   size_t offset;
 
   CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
         offset(offset) {}
 
   __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
+                          element_id + offset);
   }
 };
 
 // Copy the data from the given EllpackPage to the current page.
 size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
   monitor_.StartCuda("Copy");
-  size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements);
+  size_t num_elements = page->n_rows * page->row_stride;
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_GE(n_rows * row_stride, offset + num_elements);
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
   dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
   monitor_.StopCuda("Copy");
   return num_elements;
@@ -160,26 +155,29 @@ struct CompactPage {
   common::CompressedBufferWriter cbw;
   common::CompressedByteT* dst_data_d;
   common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted page.
+  /*! \brief An array that maps the rows from the full DMatrix to the compacted
+   * page.
    *
-   * The total size is the number of rows in the original, uncompacted DMatrix. Elements are the
-   * row ids in the compacted page. Rows not needed are set to SIZE_MAX.
+   * The total size is the number of rows in the original, uncompacted DMatrix.
+   * Elements are the row ids in the compacted page. Rows not needed are set to
+   * SIZE_MAX.
    *
    * An example compacting 16 rows to 8 rows:
-   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX,
-   * SIZE_MAX]
+   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6,
+   * SIZE_MAX, 7, SIZE_MAX, SIZE_MAX]
    */
   common::Span<size_t> row_indexes;
   size_t base_rowid;
   size_t row_stride;
 
-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span<size_t> row_indexes)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
+              common::Span<size_t> row_indexes)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
         row_indexes(row_indexes),
-        base_rowid{src->matrix.base_rowid},
-        row_stride{src->matrix.info.row_stride} {}
+        base_rowid{src->base_rowid},
+        row_stride{src->row_stride} {}
 
   __device__ void operator()(size_t row_id) {
     size_t src_row = base_rowid + row_id;
@@ -188,100 +186,72 @@ struct CompactPage {
     size_t dst_offset = dst_row * row_stride;
     size_t src_offset = row_id * row_stride;
     for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
+                            dst_offset + j);
     }
   }
 };
 
 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes) {
+void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
+                              common::Span<size_t> row_indexes) {
   monitor_.StartCuda("Compact");
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size());
-  dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes));
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
+  dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
   monitor_.StopCuda("Compact");
 }
 
-// Construct an EllpackInfo based on histogram cuts of features.
-EllpackInfo::EllpackInfo(int device,
-                         bool is_dense,
-                         size_t row_stride,
-                         const common::HistogramCuts& hmat,
-                         dh::BulkAllocator* ba)
-    : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {
-
-  ba->Allocate(device,
-               &feature_segments, hmat.Ptrs().size(),
-               &gidx_fvalue_map, hmat.Values().size(),
-               &min_fvalue, hmat.MinValues().size());
-  dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
-  dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
-  dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
-}
-
-// Initialize the EllpackInfo for this page.
-void EllpackPageImpl::InitInfo(int device,
-                               bool is_dense,
-                               size_t row_stride,
-                               const common::HistogramCuts& hmat) {
-  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
-}
-
 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) {
-  size_t num_symbols = matrix.info.NumSymbols();
+void EllpackPageImpl::InitCompressedData(int device) {
+  size_t num_symbols = NumSymbols();
 
   // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * num_rows, num_symbols);
-  ba_.Allocate(device, &gidx_buffer, compressed_size_bytes);
-  thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
+                                                           num_symbols);
+  gidx_buffer.SetDevice(device);
+  // Don't call fill unnecessarily
+  if (gidx_buffer.Size() == 0) {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+  } else {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
+  }
 }
 
 // Compress a CSR page into ELLPACK.
 void EllpackPageImpl::CreateHistIndices(int device,
-                                        const SparsePage& row_batch,
-                                        const RowStateOnDevice& device_row_state) {
-  // Has any been allocated for me in this batch?
-  if (!device_row_state.rows_to_process_from_batch) return;
-
-  unsigned int null_gidx_value = matrix.info.n_bins;
-  size_t row_stride = matrix.info.row_stride;
+                                        const SparsePage& row_batch) {
+  if (row_batch.Size() == 0) return;
+  unsigned int null_gidx_value = NumSymbols() - 1;
 
   const auto& offset_vec = row_batch.offset.ConstHostVector();
 
   // bin and compress entries in batches of rows
-  size_t gpu_batch_nrows = std::min(
-      dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
-      static_cast<size_t>(device_row_state.rows_to_process_from_batch));
+  size_t gpu_batch_nrows =
+      std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
+               static_cast<size_t>(row_batch.Size()));
   const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
 
-  size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
-                                           gpu_batch_nrows);
+  size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
 
   for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
     size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
-    size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
-    if (batch_row_end > device_row_state.rows_to_process_from_batch) {
-      batch_row_end = device_row_state.rows_to_process_from_batch;
-    }
+    size_t batch_row_end =
+        std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size());
     size_t batch_nrows = batch_row_end - batch_row_begin;
 
-    const auto ent_cnt_begin =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
-    const auto ent_cnt_end =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+    const auto ent_cnt_begin = offset_vec[batch_row_begin];
+    const auto ent_cnt_end = offset_vec[batch_row_end];
 
     /*! \brief row offset in SparsePage (the input data). */
     dh::device_vector<size_t> row_ptrs(batch_nrows + 1);
-    thrust::copy(
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
-        row_ptrs.begin());
+    thrust::copy(offset_vec.data() + batch_row_begin,
+                 offset_vec.data() + batch_row_end + 1, row_ptrs.begin());
 
     // number of entries in this batch.
     size_t n_entries = ent_cnt_end - ent_cnt_begin;
@@ -289,97 +259,50 @@ void EllpackPageImpl::CreateHistIndices(int device,
     // copy data entries to device.
     dh::safe_cuda(cudaMemcpy(entries_d.data().get(),
                              data_vec.data() + ent_cnt_begin,
-                             n_entries * sizeof(Entry),
-                             cudaMemcpyDefault));
+                             n_entries * sizeof(Entry), cudaMemcpyDefault));
     const dim3 block3(32, 8, 1);  // 256 threads
     const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
-                     common::DivRoundUp(row_stride, block3.y),
-                     1);
+                     common::DivRoundUp(row_stride, block3.y), 1);
+    auto device_accessor = GetDeviceAccessor(device);
     dh::LaunchKernel {grid3, block3}(
-        CompressBinEllpackKernel,
-        common::CompressedBufferWriter(matrix.info.NumSymbols()),
-        gidx_buffer.data(),
-        row_ptrs.data().get(),
-        entries_d.data().get(),
-        matrix.info.gidx_fvalue_map.data(),
-        matrix.info.feature_segments.data(),
-        device_row_state.total_rows_processed + batch_row_begin,
-        batch_nrows,
-        row_stride,
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
+        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
+        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(),
+        row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
         null_gidx_value);
   }
 }
 
 // Return the number of rows contained in this page.
-size_t EllpackPageImpl::Size() const {
-  return matrix.n_rows;
-}
-
-// Clear the current page.
-void EllpackPageImpl::Clear() {
-  ba_.Clear();
-  gidx_buffer = {};
-  idx_buffer.clear();
-  sparse_page_.Clear();
-  matrix.base_rowid = 0;
-  matrix.n_rows = 0;
-  device_initialized_ = false;
-}
-
-// Push a CSR page to the current page.
-//
-// The CSR pages are accumulated in memory until they reach a certain size, then written out as
-// compressed ELLPACK.
-void EllpackPageImpl::Push(int device, const SparsePage& batch) {
-  sparse_page_.Push(batch);
-  matrix.n_rows += batch.Size();
-}
-
-// Compress the accumulated SparsePage.
-void EllpackPageImpl::CompressSparsePage(int device) {
-  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, matrix.n_rows);
-  monitor_.StopCuda("InitCompressedData");
-
-  monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
-  hist_builder_row_state.BeginBatch(sparse_page_);
-  CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
-  hist_builder_row_state.EndBatch();
-  monitor_.StopCuda("BinningCompression");
-
-  monitor_.StartCuda("CopyDeviceToHost");
-  idx_buffer.resize(gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
-  ba_.Clear();
-  gidx_buffer = {};
-  monitor_.StopCuda("CopyDeviceToHost");
-}
+size_t EllpackPageImpl::Size() const { return n_rows; }
 
 // Return the memory cost for storing the compressed features.
-size_t EllpackPageImpl::MemCostBytes() const {
-  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols());
+size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
+                                     const common::HistogramCuts& cuts) {
+  // Required buffer size for storing data matrix in ELLPack format.
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows,
+                                                          cuts.TotalBins() + 1);
   return compressed_size_bytes;
 }
 
-// Copy the compressed features to GPU.
-void EllpackPageImpl::InitDevice(int device, EllpackInfo info) {
-  if (device_initialized_) return;
-
-  monitor_.StartCuda("CopyPageToDevice");
-  dh::safe_cuda(cudaSetDevice(device));
-
-  gidx_buffer = {};
-  ba_.Allocate(device, &gidx_buffer, idx_buffer.size());
-  dh::CopyVectorToDeviceSpan(gidx_buffer, idx_buffer);
-
-  matrix.info = info;
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), info.n_bins + 1);
-
-  monitor_.StopCuda("CopyPageToDevice");
-
-  device_initialized_ = true;
+EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const {
+  gidx_buffer.SetDevice(device);
+  return EllpackDeviceAccessor(
+      device, cuts_, is_dense, row_stride, base_rowid, n_rows,
+      common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
+                                           NumSymbols()));
+}
+
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 const SparsePage& page, bool is_dense,
+                                 size_t row_stride)
+    : cuts_(std::move(cuts)),
+      is_dense(is_dense),
+      n_rows(page.Size()),
+      row_stride(row_stride) {
+  this->InitCompressedData(device);
+  this->CreateHistIndices(device, page);
 }
 }  // namespace xgboost
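MemCostBytes above sizes the compressed buffer from three quantities that fully determine an ELLPACK page: the number of rows, the row stride, and the number of symbols (total bins plus one reserved for "missing"). The exact padding added by CompressedBufferWriter::CalculateBufferSize is not shown in this diff, so the following is only a rough back-of-the-envelope estimate of the same arithmetic:

#include <cmath>
#include <cstddef>
#include <iostream>

// Approximate ELLPACK buffer size: every row stores `row_stride` symbols and
// each symbol needs ceil(log2(num_symbols)) bits. Alignment/padding that the
// real CalculateBufferSize adds is ignored here.
size_t ApproxEllpackBytes(size_t n_rows, size_t row_stride, size_t num_symbols) {
  size_t bits_per_symbol =
      static_cast<size_t>(std::ceil(std::log2(static_cast<double>(num_symbols))));
  size_t total_bits = n_rows * row_stride * bits_per_symbol;
  return (total_bits + 7) / 8;
}

int main() {
  // 1M rows, 50 features per row, 256 bins per feature, plus 1 missing symbol:
  // 12801 symbols -> 14 bits each -> roughly 87.5 MB.
  std::cout << ApproxEllpackBytes(1000000, 50, 256 * 50 + 1) << " bytes\n";
}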
@@ -40,71 +40,53 @@ __forceinline__ __device__ int BinarySearchRow(
   return -1;
 }
 
-/** \brief Meta information about the ELLPACK matrix. */
-struct EllpackInfo {
+/** \brief Struct for accessing and manipulating an ellpack matrix on the
+ * device. Does not own underlying memory and may be trivially copied into
+ * kernels.*/
+struct EllpackDeviceAccessor {
   /*! \brief Whether or not if the matrix is dense. */
   bool is_dense;
   /*! \brief Row length for ELLPack, equal to number of features. */
   size_t row_stride;
-  /*! \brief Total number of bins, also used as the null index value, . */
-  size_t n_bins;
-  /*! \brief Minimum value for each feature. Size equals to number of features. */
-  common::Span<bst_float> min_fvalue;
-  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
-  common::Span<uint32_t> feature_segments;
-  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
-  common::Span<bst_float> gidx_fvalue_map;
-
-  EllpackInfo() = default;
-
-  /*!
-   * \brief Constructor.
-   *
-   * @param device The GPU device to use.
-   * @param is_dense Whether the matrix is dense.
-   * @param row_stride The number of features between starts of consecutive rows.
-   * @param hmat The histogram cuts of all the features.
-   * @param ba The BulkAllocator that owns the GPU memory.
-   */
-  explicit EllpackInfo(int device,
-                       bool is_dense,
-                       size_t row_stride,
-                       const common::HistogramCuts& hmat,
-                       dh::BulkAllocator* ba);
-
-  /*! \brief Return the total number of symbols (total number of bins plus 1 for not found). */
-  size_t NumSymbols() const {
-    return n_bins + 1;
-  }
-  size_t NumFeatures() const {
-    return min_fvalue.size();
-  }
-};
-
-/** \brief Struct for accessing and manipulating an ellpack matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
-struct EllpackMatrix {
-  EllpackInfo info;
   size_t base_rowid{};
   size_t n_rows{};
   common::CompressedIterator<uint32_t> gidx_iter;
+  /*! \brief Minimum value for each feature. Size equals to number of features. */
+  common::Span<const bst_float> min_fvalue;
+  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
+  common::Span<const uint32_t> feature_segments;
+  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
+  common::Span<const bst_float> gidx_fvalue_map;
+
+  EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
+                        bool is_dense, size_t row_stride, size_t base_rowid,
+                        size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter)
+      : is_dense(is_dense),
+        row_stride(row_stride),
+        base_rowid(base_rowid),
+        n_rows(n_rows) ,gidx_iter(gidx_iter){
+    cuts.cut_values_.SetDevice(device);
+    cuts.cut_ptrs_.SetDevice(device);
+    cuts.min_vals_.SetDevice(device);
+    gidx_fvalue_map = cuts.cut_values_.ConstDeviceSpan();
+    feature_segments = cuts.cut_ptrs_.ConstDeviceSpan();
+    min_fvalue = cuts.min_vals_.ConstDeviceSpan();
+  }
   // Get a matrix element, uses binary search for look up Return NaN if missing
   // Given a row index and a feature index, returns the corresponding cut value
   __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
     ridx -= base_rowid;
-    auto row_begin = info.row_stride * ridx;
-    auto row_end = row_begin + info.row_stride;
+    auto row_begin = row_stride * ridx;
+    auto row_end = row_begin + row_stride;
     auto gidx = -1;
-    if (info.is_dense) {
+    if (is_dense) {
       gidx = gidx_iter[row_begin + fidx];
     } else {
       gidx = BinarySearchRow(row_begin,
                              row_end,
                              gidx_iter,
-                             info.feature_segments[fidx],
-                             info.feature_segments[fidx + 1]);
+                             feature_segments[fidx],
+                             feature_segments[fidx + 1]);
     }
     return gidx;
   }
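GetBinIndex is the core lookup of the accessor: a dense page stores one symbol per feature, so the bin can be read directly at row_begin + fidx, while a sparse row stores only the bins that are present (padded with a null symbol) and the feature has to be located by searching for a bin that falls inside its cut-pointer range. The device-side BinarySearchRow itself is not shown in this hunk; the following is an equivalent host-side sketch over plain vectors, an illustration only:

#include <cstdint>
#include <vector>

// Host-side sketch of the accessor's lookup. `gidx` holds one row of symbols,
// sorted ascending for sparse rows (unused slots padded with the null value).
int32_t GetBinIndex(const std::vector<uint32_t>& gidx, size_t row_stride,
                    bool is_dense, const std::vector<uint32_t>& feature_segments,
                    size_t ridx, size_t fidx) {
  size_t row_begin = row_stride * ridx;
  size_t row_end = row_begin + row_stride;
  if (is_dense) {
    return static_cast<int32_t>(gidx[row_begin + fidx]);
  }
  uint32_t lo = feature_segments[fidx], hi = feature_segments[fidx + 1];
  size_t left = row_begin, right = row_end;
  while (left < right) {  // binary search for a bin in [lo, hi)
    size_t mid = (left + right) / 2;
    if (gidx[mid] < lo) {
      left = mid + 1;
    } else if (gidx[mid] >= hi) {
      right = mid;
    } else {
      return static_cast<int32_t>(gidx[mid]);
    }
  }
  return -1;  // feature missing in this row
}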
@@ -113,97 +95,27 @@ struct EllpackMatrix {
     if (gidx == -1) {
       return nan("");
     }
-    return info.gidx_fvalue_map[gidx];
+    return gidx_fvalue_map[gidx];
   }
 
   // Check if the row id is withing range of the current batch.
   __device__ bool IsInRange(size_t row_id) const {
     return row_id >= base_rowid && row_id < base_rowid + n_rows;
   }
+  /*! \brief Return the total number of symbols (total number of bins plus 1 for
+   * not found). */
+  size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
+
+  size_t NullValue() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
 };
 
-// Instances of this type are created while creating the histogram bins for the
-// entire dataset across multiple sparse page batches. This keeps track of the number
-// of rows to process from a batch and the position from which to process on each device.
-struct RowStateOnDevice {
-  // Number of rows assigned to this device
-  size_t total_rows_assigned_to_device;
-  // Number of rows processed thus far
-  size_t total_rows_processed;
-  // Number of rows to process from the current sparse page batch
-  size_t rows_to_process_from_batch;
-  // Offset from the current sparse page batch to begin processing
-  size_t row_offset_in_current_batch;
-
-  explicit RowStateOnDevice(size_t total_rows)
-    : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-      rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
-  }
-
-  explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
-    : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-      rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
-  }
-
-  // Advance the row state by the number of rows processed
-  void Advance() {
-    total_rows_processed += rows_to_process_from_batch;
-    CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
-    rows_to_process_from_batch = row_offset_in_current_batch = 0;
-  }
-};
-
-// An instance of this type is created which keeps track of total number of rows to process,
-// rows processed thus far, rows to process and the offset from the current sparse page batch
-// to begin processing on each device
-class DeviceHistogramBuilderState {
- public:
-  explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {}
-
-  const RowStateOnDevice& GetRowStateOnDevice() const {
-    return device_row_state_;
-  }
-
-  // This method is invoked at the beginning of each sparse page batch. This distributes
-  // the rows in the sparse page to the device.
-  // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
-  void BeginBatch(const SparsePage &batch) {
-    size_t rem_rows = batch.Size();
-    size_t row_offset_in_current_batch = 0;
-
-    // Do we have anymore left to process from this batch on this device?
-    if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
-      // There are still some rows that needs to be assigned to this device
-      device_row_state_.rows_to_process_from_batch =
-        std::min(
-          device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
-          rem_rows);
-    } else {
-      // All rows have been assigned to this device
-      device_row_state_.rows_to_process_from_batch = 0;
-    }
-
-    device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
-    row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
-    rem_rows -= device_row_state_.rows_to_process_from_batch;
-  }
-
-  // This method is invoked after completion of each sparse page batch
-  void EndBatch() {
-    device_row_state_.Advance();
-  }
-
- private:
-  RowStateOnDevice device_row_state_{0};
-};
-
 class EllpackPageImpl {
  public:
-  EllpackMatrix matrix;
-  /*! \brief global index of histogram, which is stored in ELLPack format. */
-  common::Span<common::CompressedByteT> gidx_buffer;
-  std::vector<common::CompressedByteT> idx_buffer;
-
   /*!
    * \brief Default constructor.
    *
@@ -218,7 +130,12 @@ class EllpackPageImpl {
    * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
    * and the given number of rows.
    */
-  explicit EllpackPageImpl(int device, EllpackInfo info, size_t n_rows);
+  EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
+                  size_t row_stride, size_t n_rows);
+
+  EllpackPageImpl(int device, common::HistogramCuts cuts,
+                  const SparsePage& page,
+                  bool is_dense,size_t row_stride);
 
   /*!
    * \brief Constructor from an existing DMatrix.
@ -245,77 +162,53 @@ class EllpackPageImpl {
|
|||||||
*/
|
*/
|
||||||
void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
|
void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Initialize the EllpackInfo contained in the EllpackMatrix.
|
|
||||||
*
|
|
||||||
* This is used in the in-memory case. The current page owns the BulkAllocator, which in turn owns
|
|
||||||
* the GPU memory used by the EllpackInfo.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param is_dense Whether the matrix is dense.
|
|
||||||
* @param row_stride The number of features between starts of consecutive rows.
|
|
||||||
* @param hmat The histogram cuts of all the features.
|
|
||||||
*/
|
|
||||||
void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat);
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Initialize the buffer to store compressed features.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param num_rows The number of rows we are storing in the buffer.
|
|
||||||
*/
|
|
||||||
void InitCompressedData(int device, size_t num_rows);
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Compress a single page of CSR data into ELLPACK.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param row_batch The CSR page.
|
|
||||||
* @param device_row_state On-device data for maintaining state.
|
|
||||||
*/
|
|
||||||
void CreateHistIndices(int device,
|
|
||||||
const SparsePage& row_batch,
|
|
||||||
const RowStateOnDevice& device_row_state);
|
|
||||||
|
|
||||||
/*! \return Number of instances in the page. */
|
/*! \return Number of instances in the page. */
|
||||||
size_t Size() const;
|
size_t Size() const;
|
||||||
|
|
||||||
/*! \brief Set the base row id for this page. */
|
/*! \brief Set the base row id for this page. */
|
||||||
inline void SetBaseRowId(size_t row_id) {
|
void SetBaseRowId(size_t row_id) {
|
||||||
matrix.base_rowid = row_id;
|
base_rowid = row_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! \brief clear the page. */
|
|
||||||
void Clear();
|
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Push a sparse page.
|
|
||||||
* \param batch The row page.
|
|
||||||
*/
|
|
||||||
void Push(int device, const SparsePage& batch);
|
|
||||||
|
|
||||||
/*! \return Estimation of memory cost of this page. */
|
/*! \return Estimation of memory cost of this page. */
|
||||||
size_t MemCostBytes() const;
|
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief Copy the ELLPACK matrix to GPU.
|
|
||||||
*
|
|
||||||
* @param device The GPU device to use.
|
|
||||||
* @param info The EllpackInfo for the matrix.
|
|
||||||
*/
|
|
||||||
void InitDevice(int device, EllpackInfo info);
|
|
||||||
|
|
||||||
/*! \brief Compress the accumulated SparsePage into ELLPACK format.
|
/*! \brief Return the total number of symbols (total number of bins plus 1 for
|
||||||
*
|
* not found). */
|
||||||
* @param device The GPU device to use.
|
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
|
||||||
*/
|
|
||||||
void CompressSparsePage(int device);
|
EllpackDeviceAccessor GetDeviceAccessor(int device) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
/*!
|
||||||
|
* \brief Compress a single page of CSR data into ELLPACK.
|
||||||
|
*
|
||||||
|
* @param device The GPU device to use.
|
||||||
|
* @param row_batch The CSR page.
|
||||||
|
*/
|
||||||
|
void CreateHistIndices(int device,
|
||||||
|
const SparsePage& row_batch
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
* \brief Initialize the buffer to store compressed features.
|
||||||
|
*/
|
||||||
|
void InitCompressedData(int device);
|
||||||
|
|
||||||
|
|
||||||
|
public:
|
||||||
|
/*! \brief Whether the matrix is dense. */
|
||||||
|
bool is_dense;
|
||||||
|
/*! \brief Row length for ELLPACK. */
|
||||||
|
size_t row_stride;
|
||||||
|
size_t base_rowid{0};
|
||||||
|
size_t n_rows{};
|
||||||
|
/*! \brief Global histogram index, stored in ELLPACK format. */
|
||||||
|
HostDeviceVector<common::CompressedByteT> gidx_buffer;
|
||||||
|
common::HistogramCuts cuts_;
|
||||||
private:
|
private:
|
||||||
common::Monitor monitor_;
|
common::Monitor monitor_;
|
||||||
dh::BulkAllocator ba_;
|
|
||||||
bool device_initialized_{false};
|
|
||||||
SparsePage sparse_page_{};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
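A minimal usage sketch of the reworked page layout above (not part of the diff; `dmat` is a placeholder and the includes are assumptions based on the files touched by this change). Metadata that used to live on EllpackMatrix/EllpackInfo is now read directly from the page, and device code gets a flattened accessor:

#include <xgboost/data.h>
#include "ellpack_page.cuh"

void InspectEllpackPages(xgboost::DMatrix* dmat) {
  // {gpu_id, max_bin, gpu_batch_nrows, gpu_page_size}, as used by the tests below.
  xgboost::BatchParam param{0, 256, 0, 0};
  for (auto& page : dmat->GetBatches<xgboost::EllpackPage>(param)) {
    auto* impl = page.Impl();
    size_t rows    = impl->n_rows;        // shape now lives on the page itself
    size_t stride  = impl->row_stride;
    bool   dense   = impl->is_dense;
    size_t symbols = impl->NumSymbols();  // cuts_.TotalBins() + 1 for the "not found" symbol
    auto   acc     = impl->GetDeviceAccessor(param.gpu_id);  // view handed to device kernels
    (void)rows; (void)stride; (void)dense; (void)symbols; (void)acc;
  }
}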
|
|||||||
@ -17,26 +17,35 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
|||||||
public:
|
public:
|
||||||
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
||||||
auto* impl = page->Impl();
|
auto* impl = page->Impl();
|
||||||
impl->Clear();
|
fi->Read(&impl->cuts_.cut_values_.HostVector());
|
||||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
fi->Read(&impl->cuts_.cut_ptrs_.HostVector());
|
||||||
return fi->Read(&impl->idx_buffer);
|
fi->Read(&impl->cuts_.min_vals_.HostVector());
|
||||||
|
fi->Read(&impl->n_rows);
|
||||||
|
fi->Read(&impl->is_dense);
|
||||||
|
fi->Read(&impl->row_stride);
|
||||||
|
if (!fi->Read(&impl->gidx_buffer.HostVector())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Read(EllpackPage* page,
|
bool Read(EllpackPage* page,
|
||||||
dmlc::SeekStream* fi,
|
dmlc::SeekStream* fi,
|
||||||
const std::vector<bst_uint>& sorted_index_set) override {
|
const std::vector<bst_uint>& sorted_index_set) override {
|
||||||
auto* impl = page->Impl();
|
LOG(FATAL) << "Not implemented";
|
||||||
impl->Clear();
|
return false;
|
||||||
if (!fi->Read(&impl->matrix.n_rows)) return false;
|
|
||||||
return fi->Read(&page->Impl()->idx_buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
void Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
||||||
auto* impl = page.Impl();
|
auto* impl = page.Impl();
|
||||||
fo->Write(impl->matrix.n_rows);
|
fo->Write(impl->cuts_.cut_values_.ConstHostVector());
|
||||||
auto buffer = impl->idx_buffer;
|
fo->Write(impl->cuts_.cut_ptrs_.ConstHostVector());
|
||||||
CHECK(!buffer.empty());
|
fo->Write(impl->cuts_.min_vals_.ConstHostVector());
|
||||||
fo->Write(buffer);
|
fo->Write(impl->n_rows);
|
||||||
|
fo->Write(impl->is_dense);
|
||||||
|
fo->Write(impl->row_stride);
|
||||||
|
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
|
||||||
|
fo->Write(impl->gidx_buffer.HostVector());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
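For reference, the on-disk record implied by the Write()/Read() pair above, one page per record; this is only a summary of the calls made there, not a separate format spec:

// cut_values   std::vector<float>                    quantile cut values, all features
// cut_ptrs     std::vector<uint32_t>                 per-feature offsets into cut_values
// min_vals     std::vector<float>                    per-feature minimum values
// n_rows       size_t
// is_dense     bool
// row_stride   size_t
// gidx_buffer  std::vector<common::CompressedByteT>  compressed ELLPACK bin indices
// Read() must consume the fields in exactly this order for a page to round-trip.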
|
|
||||||
|
|||||||
@ -2,45 +2,23 @@
|
|||||||
* Copyright 2019 XGBoost contributors
|
* Copyright 2019 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_USE_CUDA
|
#ifndef XGBOOST_USE_CUDA
|
||||||
|
#include <dmlc/base.h>
|
||||||
|
#if DMLC_ENABLE_STD_THREAD
|
||||||
|
|
||||||
#include <xgboost/data.h>
|
|
||||||
#include "ellpack_page_source.h"
|
#include "ellpack_page_source.h"
|
||||||
|
#include <xgboost/data.h>
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||||
const std::string& cache_info,
|
const std::string& cache_info,
|
||||||
const BatchParam& param) noexcept(false) {
|
const BatchParam& param) noexcept(false) {
|
||||||
LOG(FATAL) << "Internal Error: "
|
LOG(FATAL)
|
||||||
|
<< "Internal Error: "
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
||||||
}
|
}
|
||||||
|
|
||||||
void EllpackPageSource::BeforeFirst() {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
}
|
|
||||||
|
|
||||||
bool EllpackPageSource::Next() {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
EllpackPage& EllpackPageSource::Value() {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
EllpackPage* page { nullptr };
|
|
||||||
return *page;
|
|
||||||
}
|
|
||||||
|
|
||||||
const EllpackPage& EllpackPageSource::Value() const {
|
|
||||||
LOG(FATAL) << "Internal Error: "
|
|
||||||
"XGBoost is not compiled with CUDA but EllpackPageSource is required";
|
|
||||||
EllpackPage* page { nullptr };
|
|
||||||
return *page;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
#endif // DMLC_ENABLE_STD_THREAD
|
||||||
#endif // XGBOOST_USE_CUDA
|
#endif // XGBOOST_USE_CUDA
|
||||||
|
|||||||
@ -3,73 +3,16 @@
|
|||||||
*/
|
*/
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "../common/hist_util.h"
|
#include "../common/hist_util.h"
|
||||||
|
|
||||||
|
#include "ellpack_page.cuh"
|
||||||
#include "ellpack_page_source.h"
|
#include "ellpack_page_source.h"
|
||||||
#include "sparse_page_source.h"
|
#include "sparse_page_source.h"
|
||||||
#include "ellpack_page.cuh"
|
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
class EllpackPageSourceImpl : public DataSource<EllpackPage> {
|
|
||||||
public:
|
|
||||||
/*!
|
|
||||||
* \brief Create source from cache files the cache_prefix.
|
|
||||||
* \param cache_prefix The prefix of cache we want to solve.
|
|
||||||
*/
|
|
||||||
explicit EllpackPageSourceImpl(DMatrix* dmat,
|
|
||||||
const std::string& cache_info,
|
|
||||||
const BatchParam& param) noexcept(false);
|
|
||||||
|
|
||||||
/*! \brief destructor */
|
|
||||||
~EllpackPageSourceImpl() override = default;
|
|
||||||
|
|
||||||
void BeforeFirst() override;
|
|
||||||
bool Next() override;
|
|
||||||
EllpackPage& Value();
|
|
||||||
const EllpackPage& Value() const override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
/*! \brief Write Ellpack pages after accumulating them in memory. */
|
|
||||||
void WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const;
|
|
||||||
|
|
||||||
/*! \brief The page type string for ELLPACK. */
|
|
||||||
const std::string kPageType_{".ellpack.page"};
|
|
||||||
|
|
||||||
int device_{-1};
|
|
||||||
size_t page_size_{DMatrix::kPageSize};
|
|
||||||
common::Monitor monitor_;
|
|
||||||
dh::BulkAllocator ba_;
|
|
||||||
/*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
|
|
||||||
EllpackInfo ellpack_info_;
|
|
||||||
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> source_;
|
|
||||||
std::string cache_info_;
|
|
||||||
};
|
|
||||||
|
|
||||||
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
|
||||||
const std::string& cache_info,
|
|
||||||
const BatchParam& param) noexcept(false)
|
|
||||||
: impl_{new EllpackPageSourceImpl(dmat, cache_info, param)} {}
|
|
||||||
|
|
||||||
void EllpackPageSource::BeforeFirst() {
|
|
||||||
impl_->BeforeFirst();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool EllpackPageSource::Next() {
|
|
||||||
return impl_->Next();
|
|
||||||
}
|
|
||||||
|
|
||||||
EllpackPage& EllpackPageSource::Value() {
|
|
||||||
return impl_->Value();
|
|
||||||
}
|
|
||||||
|
|
||||||
const EllpackPage& EllpackPageSource::Value() const {
|
|
||||||
return impl_->Value();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t GetRowStride(DMatrix* dmat) {
|
size_t GetRowStride(DMatrix* dmat) {
|
||||||
if (dmat->IsDense()) return dmat->Info().num_col_;
|
if (dmat->IsDense()) return dmat->Info().num_col_;
|
||||||
|
|
||||||
@ -86,17 +29,19 @@ size_t GetRowStride(DMatrix* dmat) {
|
|||||||
|
|
||||||
// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
|
// Build the quantile sketch across the whole input data, then use the histogram cuts to compress
|
||||||
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
// each CSR page, and write the accumulated ELLPACK pages to disk.
|
||||||
EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
EllpackPageSource::EllpackPageSource(DMatrix* dmat,
|
||||||
const std::string& cache_info,
|
const std::string& cache_info,
|
||||||
const BatchParam& param) noexcept(false)
|
const BatchParam& param) noexcept(false) {
|
||||||
: device_(param.gpu_id), cache_info_(cache_info) {
|
cache_info_ = ParseCacheInfo(cache_info, kPageType_);
|
||||||
|
for (auto file : cache_info_.name_shards) {
|
||||||
|
CheckCacheFileExists(file);
|
||||||
|
}
|
||||||
if (param.gpu_page_size > 0) {
|
if (param.gpu_page_size > 0) {
|
||||||
page_size_ = param.gpu_page_size;
|
page_size_ = param.gpu_page_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
monitor_.Init("ellpack_page_source");
|
monitor_.Init("ellpack_page_source");
|
||||||
dh::safe_cuda(cudaSetDevice(device_));
|
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||||
|
|
||||||
monitor_.StartCuda("Quantiles");
|
monitor_.StartCuda("Quantiles");
|
||||||
size_t row_stride = GetRowStride(dmat);
|
size_t row_stride = GetRowStride(dmat);
|
||||||
@ -104,75 +49,52 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
|
|||||||
param.gpu_batch_nrows);
|
param.gpu_batch_nrows);
|
||||||
monitor_.StopCuda("Quantiles");
|
monitor_.StopCuda("Quantiles");
|
||||||
|
|
||||||
monitor_.StartCuda("CreateEllpackInfo");
|
|
||||||
ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, cuts, &ba_);
|
|
||||||
monitor_.StopCuda("CreateEllpackInfo");
|
|
||||||
|
|
||||||
monitor_.StartCuda("WriteEllpackPages");
|
monitor_.StartCuda("WriteEllpackPages");
|
||||||
WriteEllpackPages(dmat, cache_info);
|
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
|
||||||
monitor_.StopCuda("WriteEllpackPages");
|
monitor_.StopCuda("WriteEllpackPages");
|
||||||
|
|
||||||
source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
|
external_prefetcher_.reset(
|
||||||
ParseCacheInfo(cache_info_, kPageType_)));
|
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
|
||||||
}
|
|
||||||
|
|
||||||
void EllpackPageSourceImpl::BeforeFirst() {
|
|
||||||
source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
|
|
||||||
ParseCacheInfo(cache_info_, kPageType_)));
|
|
||||||
source_->BeforeFirst();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool EllpackPageSourceImpl::Next() {
|
|
||||||
return source_->Next();
|
|
||||||
}
|
|
||||||
|
|
||||||
EllpackPage& EllpackPageSourceImpl::Value() {
|
|
||||||
EllpackPage& page = source_->Value();
|
|
||||||
page.Impl()->InitDevice(device_, ellpack_info_);
|
|
||||||
return page;
|
|
||||||
}
|
|
||||||
|
|
||||||
const EllpackPage& EllpackPageSourceImpl::Value() const {
|
|
||||||
EllpackPage& page = source_->Value();
|
|
||||||
page.Impl()->InitDevice(device_, ellpack_info_);
|
|
||||||
return page;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
|
// Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
|
||||||
void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const {
|
void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
|
||||||
|
const common::HistogramCuts& cuts,
|
||||||
|
const std::string& cache_info,
|
||||||
|
size_t row_stride) const {
|
||||||
auto cinfo = ParseCacheInfo(cache_info, kPageType_);
|
auto cinfo = ParseCacheInfo(cache_info, kPageType_);
|
||||||
const size_t extra_buffer_capacity = 6;
|
const size_t extra_buffer_capacity = 6;
|
||||||
SparsePageWriter<EllpackPage> writer(
|
SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
|
||||||
cinfo.name_shards, cinfo.format_shards, extra_buffer_capacity);
|
extra_buffer_capacity);
|
||||||
std::shared_ptr<EllpackPage> page;
|
std::shared_ptr<EllpackPage> page;
|
||||||
|
SparsePage temp_host_page;
|
||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
auto* impl = page->Impl();
|
auto* impl = page->Impl();
|
||||||
impl->matrix.info = ellpack_info_;
|
|
||||||
impl->Clear();
|
|
||||||
|
|
||||||
const MetaInfo& info = dmat->Info();
|
|
||||||
size_t bytes_write = 0;
|
size_t bytes_write = 0;
|
||||||
double tstart = dmlc::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
|
||||||
impl->Push(device_, batch);
|
temp_host_page.Push(batch);
|
||||||
|
|
||||||
size_t mem_cost_bytes = impl->MemCostBytes();
|
size_t mem_cost_bytes =
|
||||||
|
EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
|
||||||
if (mem_cost_bytes >= page_size_) {
|
if (mem_cost_bytes >= page_size_) {
|
||||||
bytes_write += mem_cost_bytes;
|
bytes_write += mem_cost_bytes;
|
||||||
impl->CompressSparsePage(device_);
|
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
||||||
|
row_stride);
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
writer.Alloc(&page);
|
writer.Alloc(&page);
|
||||||
impl = page->Impl();
|
impl = page->Impl();
|
||||||
impl->matrix.info = ellpack_info_;
|
temp_host_page.Clear();
|
||||||
impl->Clear();
|
|
||||||
double tdiff = dmlc::GetTime() - tstart;
|
double tdiff = dmlc::GetTime() - tstart;
|
||||||
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
|
||||||
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
<< ((bytes_write >> 20UL) / tdiff) << " MB/s, "
|
||||||
<< (bytes_write >> 20UL) << " written";
|
<< (bytes_write >> 20UL) << " written";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (impl->Size() != 0) {
|
if (temp_host_page.Size() != 0) {
|
||||||
impl->CompressSparsePage(device_);
|
*impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
|
||||||
|
row_stride);
|
||||||
writer.PushWrite(std::move(page));
|
writer.PushWrite(std::move(page));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
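The flush threshold used by WriteEllpackPages() above comes from BatchParam; a minimal sketch (illustrative only, `dmat` is a placeholder) of requesting smaller ELLPACK shards:

void IterateSmallShards(xgboost::DMatrix* dmat) {
  // gpu_page_size overrides the default DMatrix::kPageSize threshold.
  xgboost::BatchParam param{0, 256, 0, 1 << 20};  // flush a shard at roughly 1 MiB
  for (auto& page : dmat->GetBatches<xgboost::EllpackPage>(param)) {
    // Each page was written once MemCostBytes(rows_so_far, row_stride, cuts) reached
    // the threshold, or at the end of the input for the final partial shard.
    (void)page;
  }
}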
|
|||||||
@ -10,19 +10,17 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "../common/timer.h"
|
#include "../common/timer.h"
|
||||||
|
#include "../common/hist_util.h"
|
||||||
|
#include "sparse_page_source.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace data {
|
namespace data {
|
||||||
|
|
||||||
class EllpackPageSourceImpl;
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief External memory data source for ELLPACK format.
|
* \brief External memory data source for ELLPACK format.
|
||||||
*
|
*
|
||||||
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
|
|
||||||
* including CUDA-specific implementation details in the header.
|
|
||||||
*/
|
*/
|
||||||
class EllpackPageSource : public DataSource<EllpackPage> {
|
class EllpackPageSource {
|
||||||
public:
|
public:
|
||||||
/*!
|
/*!
|
||||||
* \brief Create a source from cache files with the given cache prefix.
|
* \brief Create a source from cache files with the given cache prefix.
|
||||||
@ -32,19 +30,33 @@ class EllpackPageSource : public DataSource<EllpackPage> {
|
|||||||
const std::string& cache_info,
|
const std::string& cache_info,
|
||||||
const BatchParam& param) noexcept(false);
|
const BatchParam& param) noexcept(false);
|
||||||
|
|
||||||
/*! \brief destructor */
|
BatchSet<EllpackPage> GetBatchSet() {
|
||||||
~EllpackPageSource() override = default;
|
auto begin_iter = BatchIterator<EllpackPage>(
|
||||||
|
new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
|
||||||
|
EllpackPage>(external_prefetcher_.get()));
|
||||||
|
return BatchSet<EllpackPage>(begin_iter);
|
||||||
|
}
|
||||||
|
|
||||||
void BeforeFirst() override;
|
~EllpackPageSource() {
|
||||||
bool Next() override;
|
external_prefetcher_.reset();
|
||||||
EllpackPage& Value();
|
for (auto file : cache_info_.name_shards) {
|
||||||
const EllpackPage& Value() const override;
|
TryDeleteCacheFile(file);
|
||||||
|
}
|
||||||
const EllpackPageSourceImpl* Impl() const { return impl_.get(); }
|
}
|
||||||
EllpackPageSourceImpl* Impl() { return impl_.get(); }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<EllpackPageSourceImpl> impl_;
|
void WriteEllpackPages(int device, DMatrix* dmat,
|
||||||
|
const common::HistogramCuts& cuts,
|
||||||
|
const std::string& cache_info,
|
||||||
|
size_t row_stride) const;
|
||||||
|
|
||||||
|
/*! \brief The page type string for ELLPACK. */
|
||||||
|
const std::string kPageType_{".ellpack.page"};
|
||||||
|
|
||||||
|
size_t page_size_{DMatrix::kPageSize};
|
||||||
|
common::Monitor monitor_;
|
||||||
|
std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
|
||||||
|
CacheInfo cache_info_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
|
|||||||
@ -51,11 +51,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
|
|||||||
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
|
ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
|
||||||
batch_param_ = param;
|
batch_param_ = param;
|
||||||
}
|
}
|
||||||
ellpack_source_->BeforeFirst();
|
return ellpack_source_->GetBatchSet();
|
||||||
ellpack_source_->Next();
|
|
||||||
auto begin_iter = BatchIterator<EllpackPage>(
|
|
||||||
new SparseBatchIteratorImpl<EllpackPageSource, EllpackPage>(ellpack_source_.get()));
|
|
||||||
return BatchSet<EllpackPage>(begin_iter);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace data
|
} // namespace data
|
||||||
|
|||||||
@ -97,9 +97,11 @@ struct SparsePageLoader {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct EllpackLoader {
|
struct EllpackLoader {
|
||||||
EllpackMatrix const& matrix;
|
EllpackDeviceAccessor const& matrix;
|
||||||
XGBOOST_DEVICE EllpackLoader(EllpackMatrix const& m, bool use_shared, bst_feature_t num_features,
|
XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool use_shared,
|
||||||
bst_row_t num_rows, size_t entry_start) : matrix{m} {}
|
bst_feature_t num_features, bst_row_t num_rows,
|
||||||
|
size_t entry_start)
|
||||||
|
: matrix{m} {}
|
||||||
__device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
|
__device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
|
||||||
auto gidx = matrix.GetBinIndex(ridx, fidx);
|
auto gidx = matrix.GetBinIndex(ridx, fidx);
|
||||||
if (gidx == -1) {
|
if (gidx == -1) {
|
||||||
@ -107,10 +109,10 @@ struct EllpackLoader {
|
|||||||
}
|
}
|
||||||
// The gradient index needs to be shifted by one as min values are not included in the
|
// The gradient index needs to be shifted by one as min values are not included in the
|
||||||
// cuts.
|
// cuts.
|
||||||
if (gidx == matrix.info.feature_segments[fidx]) {
|
if (gidx == matrix.feature_segments[fidx]) {
|
||||||
return matrix.info.min_fvalue[fidx];
|
return matrix.min_fvalue[fidx];
|
||||||
}
|
}
|
||||||
return matrix.info.gidx_fvalue_map[gidx - 1];
|
return matrix.gidx_fvalue_map[gidx - 1];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
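A worked example of the lookup in GetFvalue() above (the numbers are hypothetical):

// Suppose feature 2 owns bins [feature_segments[2], feature_segments[3]) == [10, 13),
// the global cut array holds gidx_fvalue_map[10..12] == {0.5f, 1.5f, 2.5f}, and
// min_fvalue[2] == 0.1f.
//   gidx == 10 -> first bin of the feature -> return min_fvalue[2] == 0.1f
//   gidx == 11 -> return gidx_fvalue_map[10] == 0.5f
//   gidx == 12 -> return gidx_fvalue_map[11] == 1.5f
// The "- 1" shift exists because the per-feature minimum is not stored in the cuts, so a
// bin's representative value is the cut just below it.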
|
|
||||||
@ -217,7 +219,7 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
this->tree_begin_, this->tree_end_, num_features, num_rows,
|
this->tree_begin_, this->tree_end_, num_features, num_rows,
|
||||||
entry_start, use_shared, this->num_group_);
|
entry_start, use_shared, this->num_group_);
|
||||||
}
|
}
|
||||||
void PredictInternal(EllpackMatrix const& batch, HostDeviceVector<bst_float>* out_preds,
|
void PredictInternal(EllpackDeviceAccessor const& batch, HostDeviceVector<bst_float>* out_preds,
|
||||||
size_t batch_offset) {
|
size_t batch_offset) {
|
||||||
const uint32_t BLOCK_THREADS = 256;
|
const uint32_t BLOCK_THREADS = 256;
|
||||||
size_t num_rows = batch.n_rows;
|
size_t num_rows = batch.n_rows;
|
||||||
@ -226,11 +228,11 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
bool use_shared = false;
|
bool use_shared = false;
|
||||||
size_t entry_start = 0;
|
size_t entry_start = 0;
|
||||||
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
|
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
|
||||||
PredictKernel<EllpackLoader, EllpackMatrix>,
|
PredictKernel<EllpackLoader, EllpackDeviceAccessor>,
|
||||||
batch,
|
batch,
|
||||||
dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
|
dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
|
||||||
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
|
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
|
||||||
this->tree_begin_, this->tree_end_, batch.info.NumFeatures(), num_rows,
|
this->tree_begin_, this->tree_end_, batch.NumFeatures(), num_rows,
|
||||||
entry_start, use_shared, this->num_group_);
|
entry_start, use_shared, this->num_group_);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -269,8 +271,10 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
if (dmat->PageExists<EllpackPage>()) {
|
if (dmat->PageExists<EllpackPage>()) {
|
||||||
size_t batch_offset = 0;
|
size_t batch_offset = 0;
|
||||||
for (auto const& page : dmat->GetBatches<EllpackPage>()) {
|
for (auto const& page : dmat->GetBatches<EllpackPage>()) {
|
||||||
this->PredictInternal(page.Impl()->matrix, out_preds, batch_offset);
|
this->PredictInternal(
|
||||||
batch_offset += page.Impl()->matrix.n_rows;
|
page.Impl()->GetDeviceAccessor(generic_param_->gpu_id), out_preds,
|
||||||
|
batch_offset);
|
||||||
|
batch_offset += page.Impl()->n_rows;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t batch_offset = 0;
|
size_t batch_offset = 0;
|
||||||
|
|||||||
@ -153,7 +153,8 @@ ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
|
|||||||
size_t n_rows,
|
size_t n_rows,
|
||||||
const BatchParam& batch_param)
|
const BatchParam& batch_param)
|
||||||
: batch_param_(batch_param),
|
: batch_param_(batch_param),
|
||||||
page_(new EllpackPageImpl(batch_param.gpu_id, page->matrix.info, n_rows)) {}
|
page_(new EllpackPageImpl(batch_param.gpu_id, page->cuts_, page->is_dense,
|
||||||
|
page->row_stride, n_rows)) {}
|
||||||
|
|
||||||
GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
|
GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
|
||||||
DMatrix* dmat) {
|
DMatrix* dmat) {
|
||||||
@ -217,9 +218,9 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
|
|||||||
|
|
||||||
// Create a new ELLPACK page with empty rows.
|
// Create a new ELLPACK page with empty rows.
|
||||||
page_.reset(); // Release the device memory first before reallocating
|
page_.reset(); // Release the device memory first before reallocating
|
||||||
page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
|
page_.reset(new EllpackPageImpl(
|
||||||
original_page_->matrix.info,
|
batch_param_.gpu_id, original_page_->cuts_, original_page_->is_dense,
|
||||||
sample_rows));
|
original_page_->row_stride, sample_rows));
|
||||||
|
|
||||||
// Compact the ELLPACK pages into the single sample page.
|
// Compact the ELLPACK pages into the single sample page.
|
||||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||||
@ -298,9 +299,9 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
|
|||||||
|
|
||||||
// Create a new ELLPACK page with empty rows.
|
// Create a new ELLPACK page with empty rows.
|
||||||
page_.reset(); // Release the device memory first before reallocating
|
page_.reset(); // Release the device memory first before reallocating
|
||||||
page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
|
page_.reset(new EllpackPageImpl(batch_param_.gpu_id, original_page_->cuts_,
|
||||||
original_page_->matrix.info,
|
original_page_->is_dense,
|
||||||
sample_rows));
|
original_page_->row_stride, sample_rows));
|
||||||
|
|
||||||
// Compact the ELLPACK pages into the single sample page.
|
// Compact the ELLPACK pages into the single sample page.
|
||||||
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
|
||||||
@ -319,7 +320,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
|
|||||||
monitor_.Init("gradient_based_sampler");
|
monitor_.Init("gradient_based_sampler");
|
||||||
|
|
||||||
bool is_sampling = subsample < 1.0;
|
bool is_sampling = subsample < 1.0;
|
||||||
bool is_external_memory = page->matrix.n_rows != n_rows;
|
bool is_external_memory = page->n_rows != n_rows;
|
||||||
|
|
||||||
if (is_sampling) {
|
if (is_sampling) {
|
||||||
switch (sampling_method) {
|
switch (sampling_method) {
|
||||||
|
|||||||
@ -101,7 +101,7 @@ template GradientPairPrecise CreateRoundingFactor(common::Span<GradientPair cons
|
|||||||
template GradientPair CreateRoundingFactor(common::Span<GradientPair const> gpair);
|
template GradientPair CreateRoundingFactor(common::Span<GradientPair const> gpair);
|
||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
__global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
__global__ void SharedMemHistKernel(EllpackDeviceAccessor matrix,
|
||||||
common::Span<const RowPartitioner::RowIndexT> d_ridx,
|
common::Span<const RowPartitioner::RowIndexT> d_ridx,
|
||||||
GradientSumT* __restrict__ d_node_hist,
|
GradientSumT* __restrict__ d_node_hist,
|
||||||
const GradientPair* __restrict__ d_gpair,
|
const GradientPair* __restrict__ d_gpair,
|
||||||
@ -112,14 +112,14 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
|||||||
extern __shared__ char smem[];
|
extern __shared__ char smem[];
|
||||||
GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT
|
GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT
|
||||||
if (use_shared_memory_histograms) {
|
if (use_shared_memory_histograms) {
|
||||||
dh::BlockFill(smem_arr, matrix.info.n_bins, GradientSumT());
|
dh::BlockFill(smem_arr, matrix.NumBins(), GradientSumT());
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
|
for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
|
||||||
int ridx = d_ridx[idx / matrix.info.row_stride];
|
int ridx = d_ridx[idx / matrix.row_stride];
|
||||||
int gidx =
|
int gidx =
|
||||||
matrix.gidx_iter[ridx * matrix.info.row_stride + idx % matrix.info.row_stride];
|
matrix.gidx_iter[ridx * matrix.row_stride + idx % matrix.row_stride];
|
||||||
if (gidx != matrix.info.n_bins) {
|
if (gidx != matrix.NumBins()) {
|
||||||
GradientSumT truncated {
|
GradientSumT truncated {
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetGrad(), d_gpair[ridx].GetGrad()),
|
TruncateWithRoundingFactor<T>(rounding.GetGrad(), d_gpair[ridx].GetGrad()),
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetHess(), d_gpair[ridx].GetHess()),
|
TruncateWithRoundingFactor<T>(rounding.GetHess(), d_gpair[ridx].GetHess()),
|
||||||
@ -135,7 +135,7 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
|||||||
if (use_shared_memory_histograms) {
|
if (use_shared_memory_histograms) {
|
||||||
// Write shared memory back to global memory
|
// Write shared memory back to global memory
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.info.n_bins)) {
|
for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.NumBins())) {
|
||||||
GradientSumT truncated {
|
GradientSumT truncated {
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetGrad(), smem_arr[i].GetGrad()),
|
TruncateWithRoundingFactor<T>(rounding.GetGrad(), smem_arr[i].GetGrad()),
|
||||||
TruncateWithRoundingFactor<T>(rounding.GetHess(), smem_arr[i].GetHess()),
|
TruncateWithRoundingFactor<T>(rounding.GetHess(), smem_arr[i].GetHess()),
|
||||||
@ -146,16 +146,16 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
void BuildGradientHistogram(EllpackMatrix const& matrix,
|
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> d_ridx,
|
common::Span<const uint32_t> d_ridx,
|
||||||
common::Span<GradientSumT> histogram,
|
common::Span<GradientSumT> histogram,
|
||||||
GradientSumT rounding, bool shared) {
|
GradientSumT rounding, bool shared) {
|
||||||
const size_t smem_size =
|
const size_t smem_size =
|
||||||
shared
|
shared
|
||||||
? sizeof(GradientSumT) * matrix.info.n_bins
|
? sizeof(GradientSumT) * matrix.NumBins()
|
||||||
: 0;
|
: 0;
|
||||||
auto n_elements = d_ridx.size() * matrix.info.row_stride;
|
auto n_elements = d_ridx.size() * matrix.row_stride;
|
||||||
|
|
||||||
uint32_t items_per_thread = 8;
|
uint32_t items_per_thread = 8;
|
||||||
uint32_t block_threads = 256;
|
uint32_t block_threads = 256;
|
||||||
@ -168,14 +168,14 @@ void BuildGradientHistogram(EllpackMatrix const& matrix,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template void BuildGradientHistogram<GradientPair>(
|
template void BuildGradientHistogram<GradientPair>(
|
||||||
EllpackMatrix const& matrix,
|
EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientPair> histogram,
|
common::Span<GradientPair> histogram,
|
||||||
GradientPair rounding, bool shared);
|
GradientPair rounding, bool shared);
|
||||||
|
|
||||||
template void BuildGradientHistogram<GradientPairPrecise>(
|
template void BuildGradientHistogram<GradientPairPrecise>(
|
||||||
EllpackMatrix const& matrix,
|
EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientPairPrecise> histogram,
|
common::Span<GradientPairPrecise> histogram,
|
||||||
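A small worked example (illustrative values only) of the ELLPACK indexing used by the kernel above:

// With row_stride == 3, every row owns exactly three slots in the compressed buffer.
// For the flat element index idx == 7:
//   ridx = d_ridx[idx / row_stride] = d_ridx[2]   // third row of this batch
//   slot = idx % row_stride         = 1           // its second feature slot
// A slot whose gidx equals matrix.NumBins() is padding for a missing value and is skipped,
// so only real entries contribute to the shared- or global-memory histogram.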
|
|||||||
@ -18,7 +18,7 @@ DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
void BuildGradientHistogram(EllpackMatrix const& matrix,
|
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
|
||||||
common::Span<GradientPair const> gpair,
|
common::Span<GradientPair const> gpair,
|
||||||
common::Span<const uint32_t> ridx,
|
common::Span<const uint32_t> ridx,
|
||||||
common::Span<GradientSumT> histogram,
|
common::Span<GradientSumT> histogram,
|
||||||
|
|||||||
@ -180,15 +180,15 @@ template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
|
|||||||
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
|
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
|
||||||
__device__ void EvaluateFeature(
|
__device__ void EvaluateFeature(
|
||||||
int fidx, common::Span<const GradientSumT> node_histogram,
|
int fidx, common::Span<const GradientSumT> node_histogram,
|
||||||
const xgboost::EllpackMatrix& matrix,
|
const EllpackDeviceAccessor& matrix,
|
||||||
DeviceSplitCandidate* best_split, // shared memory storing best split
|
DeviceSplitCandidate* best_split, // shared memory storing best split
|
||||||
const DeviceNodeStats& node, const GPUTrainingParam& param,
|
const DeviceNodeStats& node, const GPUTrainingParam& param,
|
||||||
TempStorageT* temp_storage, // temp memory for cub operations
|
TempStorageT* temp_storage, // temp memory for cub operations
|
||||||
int constraint, // monotonic_constraints
|
int constraint, // monotonic_constraints
|
||||||
const ValueConstraint& value_constraint) {
|
const ValueConstraint& value_constraint) {
|
||||||
// Use pointer from cut to indicate begin and end of bins for each feature.
|
// Use pointer from cut to indicate begin and end of bins for each feature.
|
||||||
uint32_t gidx_begin = matrix.info.feature_segments[fidx]; // beginning bin
|
uint32_t gidx_begin = matrix.feature_segments[fidx]; // beginning bin
|
||||||
uint32_t gidx_end = matrix.info.feature_segments[fidx + 1]; // end bin for i^th feature
|
uint32_t gidx_end = matrix.feature_segments[fidx + 1]; // end bin for i^th feature
|
||||||
|
|
||||||
// Sum histogram bins for current feature
|
// Sum histogram bins for current feature
|
||||||
GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
|
GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
|
||||||
@ -236,9 +236,9 @@ __device__ void EvaluateFeature(
|
|||||||
int split_gidx = (scan_begin + threadIdx.x) - 1;
|
int split_gidx = (scan_begin + threadIdx.x) - 1;
|
||||||
float fvalue;
|
float fvalue;
|
||||||
if (split_gidx < static_cast<int>(gidx_begin)) {
|
if (split_gidx < static_cast<int>(gidx_begin)) {
|
||||||
fvalue = matrix.info.min_fvalue[fidx];
|
fvalue = matrix.min_fvalue[fidx];
|
||||||
} else {
|
} else {
|
||||||
fvalue = matrix.info.gidx_fvalue_map[split_gidx];
|
fvalue = matrix.gidx_fvalue_map[split_gidx];
|
||||||
}
|
}
|
||||||
GradientSumT left = missing_left ? bin + missing : bin;
|
GradientSumT left = missing_left ? bin + missing : bin;
|
||||||
GradientSumT right = parent_sum - left;
|
GradientSumT right = parent_sum - left;
|
||||||
@ -254,7 +254,7 @@ __global__ void EvaluateSplitKernel(
|
|||||||
common::Span<const GradientSumT> node_histogram, // histogram for gradients
|
common::Span<const GradientSumT> node_histogram, // histogram for gradients
|
||||||
common::Span<const bst_feature_t> feature_set, // Selected features
|
common::Span<const bst_feature_t> feature_set, // Selected features
|
||||||
DeviceNodeStats node,
|
DeviceNodeStats node,
|
||||||
xgboost::EllpackMatrix matrix,
|
xgboost::EllpackDeviceAccessor matrix,
|
||||||
GPUTrainingParam gpu_param,
|
GPUTrainingParam gpu_param,
|
||||||
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
|
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
|
||||||
ValueConstraint value_constraint,
|
ValueConstraint value_constraint,
|
||||||
@ -601,7 +601,7 @@ struct GPUHistMakerDevice {
|
|||||||
uint32_t constexpr kBlockThreads = 256;
|
uint32_t constexpr kBlockThreads = 256;
|
||||||
dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
|
dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
|
||||||
EvaluateSplitKernel<kBlockThreads, GradientSumT>,
|
EvaluateSplitKernel<kBlockThreads, GradientSumT>,
|
||||||
hist.GetNodeHistogram(nidx), d_feature_set, node, page->matrix,
|
hist.GetNodeHistogram(nidx), d_feature_set, node, page->GetDeviceAccessor(device_id),
|
||||||
gpu_param, d_split_candidates, node_value_constraints[nidx],
|
gpu_param, d_split_candidates, node_value_constraints[nidx],
|
||||||
monotone_constraints);
|
monotone_constraints);
|
||||||
|
|
||||||
@ -625,9 +625,7 @@ struct GPUHistMakerDevice {
|
|||||||
hist.AllocateHistogram(nidx);
|
hist.AllocateHistogram(nidx);
|
||||||
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
||||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||||
auto d_gpair = gpair.data();
|
BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
|
||||||
|
|
||||||
BuildGradientHistogram(page->matrix, gpair, d_ridx, d_node_hist,
|
|
||||||
histogram_rounding, use_shared_memory_histograms);
|
histogram_rounding, use_shared_memory_histograms);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -637,7 +635,7 @@ struct GPUHistMakerDevice {
|
|||||||
auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
|
auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
|
||||||
auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
|
auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
|
||||||
|
|
||||||
dh::LaunchN(device_id, page->matrix.info.n_bins, [=] __device__(size_t idx) {
|
dh::LaunchN(device_id, page->cuts_.TotalBins(), [=] __device__(size_t idx) {
|
||||||
d_node_hist_subtraction[idx] =
|
d_node_hist_subtraction[idx] =
|
||||||
d_node_hist_parent[idx] - d_node_hist_histogram[idx];
|
d_node_hist_parent[idx] - d_node_hist_histogram[idx];
|
||||||
});
|
});
|
||||||
@ -652,7 +650,7 @@ struct GPUHistMakerDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void UpdatePosition(int nidx, RegTree::Node split_node) {
|
void UpdatePosition(int nidx, RegTree::Node split_node) {
|
||||||
auto d_matrix = page->matrix;
|
auto d_matrix = page->GetDeviceAccessor(device_id);
|
||||||
|
|
||||||
row_partitioner->UpdatePosition(
|
row_partitioner->UpdatePosition(
|
||||||
nidx, split_node.LeftChild(), split_node.RightChild(),
|
nidx, split_node.LeftChild(), split_node.RightChild(),
|
||||||
@ -689,7 +687,7 @@ struct GPUHistMakerDevice {
|
|||||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||||
row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
|
row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
|
||||||
}
|
}
|
||||||
if (page->matrix.n_rows == p_fmat->Info().num_row_) {
|
if (page->n_rows == p_fmat->Info().num_row_) {
|
||||||
FinalisePositionInPage(page, d_nodes);
|
FinalisePositionInPage(page, d_nodes);
|
||||||
} else {
|
} else {
|
||||||
for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
|
for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
|
||||||
@ -699,7 +697,7 @@ struct GPUHistMakerDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void FinalisePositionInPage(EllpackPageImpl* page, const common::Span<RegTree::Node> d_nodes) {
|
void FinalisePositionInPage(EllpackPageImpl* page, const common::Span<RegTree::Node> d_nodes) {
|
||||||
auto d_matrix = page->matrix;
|
auto d_matrix = page->GetDeviceAccessor(device_id);
|
||||||
row_partitioner->FinalisePosition(
|
row_partitioner->FinalisePosition(
|
||||||
[=] __device__(size_t row_id, int position) {
|
[=] __device__(size_t row_id, int position) {
|
||||||
if (!d_matrix.IsInRange(row_id)) {
|
if (!d_matrix.IsInRange(row_id)) {
|
||||||
@ -765,7 +763,7 @@ struct GPUHistMakerDevice {
|
|||||||
reducer->AllReduceSum(
|
reducer->AllReduceSum(
|
||||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||||
page->matrix.info.n_bins * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
|
page->cuts_.TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
|
||||||
reducer->Synchronize();
|
reducer->Synchronize();
|
||||||
|
|
||||||
monitor.StopCuda("AllReduce");
|
monitor.StopCuda("AllReduce");
|
||||||
@ -954,14 +952,14 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
|
|||||||
// check if we can use shared memory for building histograms
|
// check if we can use shared memory for building histograms
|
||||||
// (assuming we need at least 2 CTAs per SM to maintain decent latency
|
// (assuming we need at least 2 CTAs per SM to maintain decent latency
|
||||||
// hiding)
|
// hiding)
|
||||||
auto histogram_size = sizeof(GradientSumT) * page->matrix.info.n_bins;
|
auto histogram_size = sizeof(GradientSumT) * page->cuts_.TotalBins();
|
||||||
auto max_smem = dh::MaxSharedMemory(device_id);
|
auto max_smem = dh::MaxSharedMemory(device_id);
|
||||||
if (histogram_size <= max_smem) {
|
if (histogram_size <= max_smem) {
|
||||||
use_shared_memory_histograms = true;
|
use_shared_memory_histograms = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init histogram
|
// Init histogram
|
||||||
hist.Init(device_id, page->matrix.info.n_bins);
|
hist.Init(device_id, page->cuts_.TotalBins());
|
||||||
}
|
}
|
||||||
|
|
||||||
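A rough worked example of the shared-memory check above (assumes GradientPairPrecise holds two 8-byte values and a 48 KB shared-memory budget; neither number comes from this diff):

//   10 features * 256 bins  -> histogram_size = 16 B * 2,560  =  40,960 B -> fits, use shared memory
//   100 features * 256 bins -> histogram_size = 16 B * 25,600 = 409,600 B -> too large, use global memory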
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
|
|||||||
@ -19,23 +19,19 @@ TEST(EllpackPage, EmptyDMatrix) {
|
|||||||
auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
|
auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
|
||||||
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
|
auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
ASSERT_EQ(impl->matrix.info.feature_segments.size(), 1);
|
ASSERT_EQ(impl->row_stride, 0);
|
||||||
ASSERT_EQ(impl->matrix.info.min_fvalue.size(), 0);
|
ASSERT_EQ(impl->cuts_.TotalBins(), 0);
|
||||||
ASSERT_EQ(impl->matrix.info.gidx_fvalue_map.size(), 0);
|
ASSERT_EQ(impl->gidx_buffer.Size(), 4);
|
||||||
ASSERT_EQ(impl->matrix.info.row_stride, 0);
|
|
||||||
ASSERT_EQ(impl->matrix.info.n_bins, 0);
|
|
||||||
ASSERT_EQ(impl->gidx_buffer.size(), 4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(EllpackPage, BuildGidxDense) {
|
TEST(EllpackPage, BuildGidxDense) {
|
||||||
int constexpr kNRows = 16, kNCols = 8;
|
int constexpr kNRows = 16, kNCols = 8;
|
||||||
auto page = BuildEllpackPage(kNRows, kNCols);
|
auto page = BuildEllpackPage(kNRows, kNCols);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
|
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
|
||||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols());
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
|
||||||
|
|
||||||
ASSERT_EQ(page->matrix.info.row_stride, kNCols);
|
ASSERT_EQ(page->row_stride, kNCols);
|
||||||
|
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
0, 3, 8, 9, 14, 17, 20, 21,
|
0, 3, 8, 9, 14, 17, 20, 21,
|
||||||
@ -64,11 +60,10 @@ TEST(EllpackPage, BuildGidxSparse) {
|
|||||||
int constexpr kNRows = 16, kNCols = 8;
|
int constexpr kNRows = 16, kNCols = 8;
|
||||||
auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
|
auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
|
std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
|
||||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
|
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||||
|
|
||||||
ASSERT_LE(page->matrix.info.row_stride, 3);
|
ASSERT_LE(page->row_stride, 3);
|
||||||
|
|
||||||
// row_stride = 3, 16 rows, 48 entries for ELLPACK
|
// row_stride = 3, 16 rows, 48 entries for ELLPACK
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
@ -76,16 +71,16 @@ TEST(EllpackPage, BuildGidxSparse) {
|
|||||||
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
||||||
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < kNRows * page->matrix.info.row_stride; ++i) {
|
for (size_t i = 0; i < kNRows * page->row_stride; ++i) {
|
||||||
ASSERT_EQ(solution[i], gidx[i]);
|
ASSERT_EQ(solution[i], gidx[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
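A note on the constants in the sparse test above (inferred from the expected values, not a behaviour change):

// The expected `solution` pads empty slots with 24 and the decoder is built with 25 symbols,
// matching NumSymbols() == TotalBins() + 1 for 24 bins in total: symbol 24 is the
// "not found" marker. The dense test already switched from the literal 25 to
// page->NumSymbols(); the same substitution would work here.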
struct ReadRowFunction {
|
struct ReadRowFunction {
|
||||||
EllpackMatrix matrix;
|
EllpackDeviceAccessor matrix;
|
||||||
int row;
|
int row;
|
||||||
bst_float* row_data_d;
|
bst_float* row_data_d;
|
||||||
ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
|
ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
|
||||||
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
||||||
|
|
||||||
__device__ void operator()(size_t col) {
|
__device__ void operator()(size_t col) {
|
||||||
@ -110,7 +105,8 @@ TEST(EllpackPage, Copy) {
|
|||||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
|
|
||||||
// Create an empty result page.
|
// Create an empty result page.
|
||||||
EllpackPageImpl result(0, page->matrix.info, kRows);
|
EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
|
||||||
|
kRows);
|
||||||
|
|
||||||
// Copy batch pages into the result page.
|
// Copy batch pages into the result page.
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
@ -126,13 +122,13 @@ TEST(EllpackPage, Copy) {
|
|||||||
std::vector<bst_float> row_result(kCols);
|
std::vector<bst_float> row_result(kCols);
|
||||||
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl->Size(); i++) {
|
for (size_t i = 0; i < impl->Size(); i++) {
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
|
||||||
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
||||||
|
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(result.matrix, current_row, row_result_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get()));
|
||||||
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
||||||
|
|
||||||
EXPECT_EQ(row, row_result);
|
EXPECT_EQ(row, row_result);
|
||||||
@ -155,7 +151,8 @@ TEST(EllpackPage, Compact) {
|
|||||||
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
|
|
||||||
// Create an empty result page.
|
// Create an empty result page.
|
||||||
EllpackPageImpl result(0, page->matrix.info, kCompactedRows);
|
EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
|
||||||
|
kCompactedRows);
|
||||||
|
|
||||||
// Compact batch pages into the result page.
|
// Compact batch pages into the result page.
|
||||||
std::vector<size_t> row_indexes_h {
|
std::vector<size_t> row_indexes_h {
|
||||||
@ -174,7 +171,7 @@ TEST(EllpackPage, Compact) {
|
|||||||
std::vector<bst_float> row_result(kCols);
|
std::vector<bst_float> row_result(kCols);
|
||||||
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl = page.Impl();
|
auto impl = page.Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl->Size(); i++) {
|
for (size_t i = 0; i < impl->Size(); i++) {
|
||||||
size_t compacted_row = row_indexes_h[current_row];
|
size_t compacted_row = row_indexes_h[current_row];
|
||||||
@ -183,11 +180,12 @@ TEST(EllpackPage, Compact) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
|
||||||
|
dh::safe_cuda(cudaDeviceSynchronize());
|
||||||
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
||||||
|
|
||||||
dh::LaunchN(0, kCols,
|
dh::LaunchN(0, kCols,
|
||||||
ReadRowFunction(result.matrix, compacted_row, row_result_d.data().get()));
|
ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, row_result_d.data().get()));
|
||||||
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
|
||||||
|
|
||||||
EXPECT_EQ(row, row_result);
|
EXPECT_EQ(row, row_result);
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
#include <dmlc/filesystem.h>
|
#include <dmlc/filesystem.h>
|
||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
#include "../../../src/common/compressed_iterator.h"
|
#include "../../../src/common/compressed_iterator.h"
|
||||||
|
#include "../../../src/data/ellpack_page.cuh"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
|
|
||||||
@ -58,31 +59,29 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
|
|||||||
|
|
||||||
BatchParam param{0, 2, 0, 0};
|
BatchParam param{0, 2, 0, 0};
|
||||||
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, 0);
|
EXPECT_EQ(impl->base_rowid, 0);
|
||||||
EXPECT_EQ(impl->matrix.n_rows, kRows);
|
EXPECT_EQ(impl->n_rows, kRows);
|
||||||
EXPECT_FALSE(impl->matrix.info.is_dense);
|
EXPECT_FALSE(impl->is_dense);
|
||||||
EXPECT_EQ(impl->matrix.info.row_stride, 2);
|
EXPECT_EQ(impl->row_stride, 2);
|
||||||
EXPECT_EQ(impl->matrix.info.n_bins, 4);
|
EXPECT_EQ(impl->cuts_.TotalBins(), 4);
|
||||||
|
|
||||||
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
EXPECT_EQ(impl_ext->matrix.base_rowid, 0);
|
EXPECT_EQ(impl_ext->base_rowid, 0);
|
||||||
EXPECT_EQ(impl_ext->matrix.n_rows, kRows);
|
EXPECT_EQ(impl_ext->n_rows, kRows);
|
||||||
EXPECT_FALSE(impl_ext->matrix.info.is_dense);
|
EXPECT_FALSE(impl_ext->is_dense);
|
||||||
EXPECT_EQ(impl_ext->matrix.info.row_stride, 2);
|
EXPECT_EQ(impl_ext->row_stride, 2);
|
||||||
EXPECT_EQ(impl_ext->matrix.info.n_bins, 4);
|
EXPECT_EQ(impl_ext->cuts_.TotalBins(), 4);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.size());
|
std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector());
|
||||||
std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.size());
|
std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector());
|
||||||
dh::CopyDeviceSpanToVector(&buffer, impl->gidx_buffer);
|
|
||||||
dh::CopyDeviceSpanToVector(&buffer_ext, impl_ext->gidx_buffer);
|
|
||||||
EXPECT_EQ(buffer, buffer_ext);
|
EXPECT_EQ(buffer, buffer_ext);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ReadRowFunction {
|
struct ReadRowFunction {
|
||||||
EllpackMatrix matrix;
|
EllpackDeviceAccessor matrix;
|
||||||
int row;
|
int row;
|
||||||
bst_float* row_data_d;
|
bst_float* row_data_d;
|
||||||
ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
|
ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
|
||||||
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
: matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}
|
||||||
|
|
||||||
__device__ void operator()(size_t col) {
|
__device__ void operator()(size_t col) {
|
||||||
@ -110,8 +109,8 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
|
|||||||
|
|
||||||
BatchParam param{0, kMaxBins, 0, kPageSize};
|
BatchParam param{0, kMaxBins, 0, kPageSize};
|
||||||
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
|
||||||
EXPECT_EQ(impl->matrix.base_rowid, 0);
|
EXPECT_EQ(impl->base_rowid, 0);
|
||||||
EXPECT_EQ(impl->matrix.n_rows, kRows);
|
EXPECT_EQ(impl->n_rows, kRows);
|
||||||
|
|
||||||
size_t current_row = 0;
|
size_t current_row = 0;
|
||||||
thrust::device_vector<bst_float> row_d(kCols);
|
thrust::device_vector<bst_float> row_d(kCols);
|
||||||
@ -120,13 +119,13 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
|
|||||||
std::vector<bst_float> row_ext(kCols);
|
std::vector<bst_float> row_ext(kCols);
|
||||||
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl_ext = page.Impl();
|
auto impl_ext = page.Impl();
|
||||||
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl_ext->base_rowid, current_row);
|
||||||
|
|
||||||
for (size_t i = 0; i < impl_ext->Size(); i++) {
|
for (size_t i = 0; i < impl_ext->Size(); i++) {
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
|
||||||
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
thrust::copy(row_d.begin(), row_d.end(), row.begin());
|
||||||
|
|
||||||
dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->matrix, current_row, row_ext_d.data().get()));
|
dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get()));
|
||||||
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
|
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
|
||||||
|
|
||||||
EXPECT_EQ(row, row_ext);
|
EXPECT_EQ(row, row_ext);
|
||||||
@ -155,8 +154,8 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
|
|||||||
size_t current_row = 0;
|
size_t current_row = 0;
|
||||||
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
|
||||||
auto impl_ext = page.Impl();
|
auto impl_ext = page.Impl();
|
||||||
EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
|
EXPECT_EQ(impl_ext->base_rowid, current_row);
|
||||||
current_row += impl_ext->matrix.n_rows;
|
current_row += impl_ext->n_rows;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -244,13 +244,13 @@ class HistogramCutsWrapper : public common::HistogramCuts {
|
|||||||
public:
|
public:
|
||||||
using SuperT = common::HistogramCuts;
|
using SuperT = common::HistogramCuts;
|
||||||
void SetValues(std::vector<float> cuts) {
|
void SetValues(std::vector<float> cuts) {
|
||||||
SuperT::cut_values_ = std::move(cuts);
|
SuperT::cut_values_.HostVector() = std::move(cuts);
|
||||||
}
|
}
|
||||||
void SetPtrs(std::vector<uint32_t> ptrs) {
|
void SetPtrs(std::vector<uint32_t> ptrs) {
|
||||||
SuperT::cut_ptrs_ = std::move(ptrs);
|
SuperT::cut_ptrs_.HostVector() = std::move(ptrs);
|
||||||
}
|
}
|
||||||
void SetMins(std::vector<float> mins) {
|
void SetMins(std::vector<float> mins) {
|
||||||
SuperT::min_vals_ = std::move(mins);
|
SuperT::min_vals_.HostVector() = std::move(mins);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
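A minimal sketch (hypothetical cut values) of hand-building cuts with the wrapper above, now that the setters assign through HostVector():

HistogramCutsWrapper cuts;
cuts.SetPtrs({0, 2, 4});                     // two features, two bins each
cuts.SetValues({0.5f, 1.5f, 10.0f, 20.0f});  // cuts for feature 0, then feature 1
cuts.SetMins({0.0f, 5.0f});                  // per-feature minimum values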
@ -279,10 +279,8 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
|
|||||||
row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
|
row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get(), {0, 256, 0}));
|
auto page = std::unique_ptr<EllpackPageImpl>(
|
||||||
page->InitInfo(0, (*dmat)->IsDense(), row_stride, cmat);
|
new EllpackPageImpl(0, cmat, batch, (*dmat)->IsDense(), row_stride));
|
||||||
page->InitCompressedData(0, n_rows);
|
|
||||||
page->CreateHistIndices(0, batch, RowStateOnDevice(batch.Size(), batch.Size()));
|
|
||||||
|
|
||||||
delete dmat;
|
delete dmat;
|
||||||
|
|
||||||
|
|||||||
@ -3,6 +3,7 @@
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
 #include "../../helpers.h"
+#include "dmlc/filesystem.h"

 namespace xgboost {
 namespace tree {

@ -29,7 +30,7 @@ void VerifySampling(size_t page_size,
 BatchParam param{0, 256, 0, page_size};
 auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
 if (page_size != 0) {
-EXPECT_NE(page->matrix.n_rows, kRows);
+EXPECT_NE(page->n_rows, kRows);
 }

 GradientBasedSampler sampler(page, kRows, param, subsample, sampling_method);

@ -37,11 +38,11 @@ void VerifySampling(size_t page_size,

 if (fixed_size_sampling) {
 EXPECT_EQ(sample.sample_rows, kRows);
-EXPECT_EQ(sample.page->matrix.n_rows, kRows);
+EXPECT_EQ(sample.page->n_rows, kRows);
 EXPECT_EQ(sample.gpair.size(), kRows);
 } else {
-EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016f);
-EXPECT_NEAR(sample.page->matrix.n_rows, sample_rows, kRows * 0.016f);
+EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016);
+EXPECT_NEAR(sample.page->n_rows, sample_rows, kRows * 0.016f);
 EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.016f);
 }

@ -83,7 +84,7 @@ TEST(GradientBasedSampler, NoSampling_ExternalMemory) {

 BatchParam param{0, 256, 0, kPageSize};
 auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-EXPECT_NE(page->matrix.n_rows, kRows);
+EXPECT_NE(page->n_rows, kRows);

 GradientBasedSampler sampler(page, kRows, param, kSubsample, TrainParam::kUniform);
 auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());

@ -91,21 +92,19 @@
 EXPECT_EQ(sample.sample_rows, kRows);
 EXPECT_EQ(sample.gpair.size(), gpair.Size());
 EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
-EXPECT_EQ(sampled_page->matrix.n_rows, kRows);
+EXPECT_EQ(sampled_page->n_rows, kRows);

-std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&buffer, sampled_page->gidx_buffer);
+std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector());
 common::CompressedIterator<common::CompressedByteT>
-ci(buffer.data(), sampled_page->matrix.info.NumSymbols());
+ci(buffer.data(), sampled_page->NumSymbols());

 size_t offset = 0;
 for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
 auto page = batch.Impl();
-std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&page_buffer, page->gidx_buffer);
+std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
 common::CompressedIterator<common::CompressedByteT>
-page_ci(page_buffer.data(), page->matrix.info.NumSymbols());
-size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
+page_ci(page_buffer.data(), page->NumSymbols());
+size_t num_elements = page->n_rows * page->row_stride;
 for (size_t i = 0; i < num_elements; i++) {
 EXPECT_EQ(ci[i + offset], page_ci[i]);
 }

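The comparison above walks the compressed gidx buffers with common::CompressedIterator, which decodes fixed-width symbols (bin indices) from a packed byte buffer. A small host-side round trip, assuming the companion CompressedBufferWriter keeps its CalculateBufferSize/Write interface; the include path and values are illustrative only:

#include <vector>
#include "../../src/common/compressed_iterator.h"  // illustrative path

void CompressedRoundTrip() {
  const size_t kNumSymbols = 256;  // number of distinct bin ids (NumSymbols() above)
  std::vector<int> input = {0, 7, 255, 42};
  std::vector<xgboost::common::CompressedByteT> buffer(
      xgboost::common::CompressedBufferWriter::CalculateBufferSize(input.size(), kNumSymbols));
  xgboost::common::CompressedBufferWriter writer(kNumSymbols);
  writer.Write(buffer.data(), input.begin(), input.end());
  xgboost::common::CompressedIterator<int> it(buffer.data(), kNumSymbols);
  // it[i] decodes the i-th symbol, the same way ci[i + offset] and page_ci[i]
  // are compared element by element in the test above.
}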
@ -27,7 +27,7 @@ void TestDeterminsticHistogram() {
 gpair.SetDevice(0);

 auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
 d_histogram, rounding, true);

 for (size_t i = 0; i < kRounds; ++i) {

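These tests drive the histogram kernel through the call shape shown above and then check that repeated builds are bit-identical. A commented sketch of that determinism check, reusing the names from the hunk (d_new_histogram is a stand-in for the second output buffer):

// Build a reference histogram, then rebuild with the same inputs and rounding factor.
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                       d_histogram, rounding, true);      // reference run
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                       d_new_histogram, rounding, true);  // repeat run
// With a fixed rounding factor the two histograms must match exactly, which is
// what the per-bin comparison loop in the following hunk verifies.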
@ -35,7 +35,7 @@ void TestDeterminsticHistogram() {
 auto d_histogram = dh::ToSpan(new_histogram);

 auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
 d_histogram, rounding, true);

 for (size_t j = 0; j < new_histogram.size(); ++j) {

@ -50,7 +50,7 @@ void TestDeterminsticHistogram() {
 auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
 gpair.SetDevice(0);
 dh::device_vector<Gradient> baseline(kBins * kCols);
-BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
 dh::ToSpan(baseline), rounding, true);
 for (size_t i = 0; i < baseline.size(); ++i) {
 EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),

@ -97,12 +97,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
 }
 gpair.SetDevice(0);

-thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.size());
+thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());

-common::CompressedByteT* d_gidx_buffer_ptr = page->gidx_buffer.data();
-dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr,
-sizeof(common::CompressedByteT) * page->gidx_buffer.size(),
-cudaMemcpyDeviceToHost));

 maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
 maker.hist.AllocateHistogram(0);

@ -196,15 +192,10 @@ TEST(GpuHist, EvaluateSplits) {
 auto cmat = GetHostCutMatrix();

 // Copy cut matrix to device.
-maker.ba.Allocate(0,
-&(page->matrix.info.feature_segments), cmat.Ptrs().size(),
-&(page->matrix.info.min_fvalue), cmat.MinValues().size(),
-&(page->matrix.info.gidx_fvalue_map), 24,
-&(maker.monotone_constraints), kNCols);
-dh::CopyVectorToDeviceSpan(page->matrix.info.feature_segments, cmat.Ptrs());
-dh::CopyVectorToDeviceSpan(page->matrix.info.gidx_fvalue_map, cmat.Values());
-dh::CopyVectorToDeviceSpan(maker.monotone_constraints, param.monotone_constraints);
-dh::CopyVectorToDeviceSpan(page->matrix.info.min_fvalue, cmat.MinValues());
+page->cuts_ = cmat;
+maker.ba.Allocate(0, &(maker.monotone_constraints), kNCols);
+dh::CopyVectorToDeviceSpan(maker.monotone_constraints,
+param.monotone_constraints);

 // Initialize GPUHistMakerDevice::hist
 maker.hist.Init(0, (max_bins - 1) * kNCols);

@ -274,15 +265,13 @@ void TestHistogramIndexImpl() {
 // Extract the device maker from the histogram makers and from that its compressed
 // histogram index
 const auto &maker = hist_maker.maker;
-std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&h_gidx_buffer, maker->page->gidx_buffer);
+std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());

 const auto &maker_ext = hist_maker_ext.maker;
-std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.size());
-dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, maker_ext->page->gidx_buffer);
+std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());

-ASSERT_EQ(maker->page->matrix.info.n_bins, maker_ext->page->matrix.info.n_bins);
-ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
+ASSERT_EQ(maker->page->cuts_.TotalBins(), maker_ext->page->cuts_.TotalBins());
+ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());
 }