Partial rewrite EllpackPage (#5352)

2020-03-11 10:15:53 +13:00
parent 7a99f8f27f
commit 3ad4333b0e
23 changed files with 496 additions and 733 deletions
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -181,13 +181,13 @@ class CompressedIterator {
  typedef value_type reference;             // NOLINT

 private:
-  CompressedByteT *buffer_;
+  const CompressedByteT *buffer_;
  size_t symbol_bits_;
  size_t offset_;

 public:
  CompressedIterator() : buffer_(nullptr), symbol_bits_(0), offset_(0) {}
-  CompressedIterator(CompressedByteT *buffer, size_t num_symbols)
+  CompressedIterator(const CompressedByteT *buffer, size_t num_symbols)
      : buffer_(buffer), offset_(0) {
    symbol_bits_ = detail::SymbolBits(num_symbols);
  }
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -31,7 +31,7 @@ namespace common {

 HistogramCuts::HistogramCuts() {
  monitor_.Init(__FUNCTION__);
-  cut_ptrs_.emplace_back(0);
+  cut_ptrs_.HostVector().emplace_back(0);
 }

 // Dispatch to specific builder.
@@ -52,7 +52,7 @@ void HistogramCuts::Build(DMatrix* dmat, uint32_t const max_num_bins) {
    DenseCuts cuts(this);
    cuts.Build(dmat, max_num_bins);
  }
-  LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.back();
+  LOG(INFO) << "Total number of hist bins: " << cut_ptrs_.HostVector().back();
 }

 bool CutsBuilder::UseGroup(DMatrix* dmat) {
@@ -75,7 +75,10 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,

  // Data groups, used in ranking.
  std::vector<bst_uint> const& group_ptr = info.group_ptr_;
-  p_cuts_->min_vals_.resize(end_col - beg_col, 0);
+  auto &local_min_vals = p_cuts_->min_vals_.HostVector();
+  auto &local_cuts = p_cuts_->cut_values_.HostVector();
+  auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector();
+  local_min_vals.resize(end_col - beg_col, 0);

  for (uint32_t col_id = beg_col; col_id < page.Size() && col_id < end_col; ++col_id) {
    // Using a local variable makes things easier, but at the cost of memory trashing.
@@ -85,7 +88,7 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
                                     max_num_bins);
    if (n_bins == 0) {
      // cut_ptrs_ is initialized with a zero, so there's always an element at the back
-      p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_ptrs_.back());
+      local_ptrs.emplace_back(local_ptrs.back());
      continue;
    }

@@ -112,17 +115,17 @@ void SparseCuts::SingleThreadBuild(SparsePage const& page, MetaInfo const& info,
    // Can be use data[1] as the min values so that we don't need to
    // store another array?
    float mval = summary.data[0].value;
-    p_cuts_->min_vals_[col_id - beg_col]  = mval - (fabs(mval) + 1e-5);
+    local_min_vals[col_id - beg_col]  = mval - (fabs(mval) + 1e-5);

    this->AddCutPoint(summary, max_num_bins);

    bst_float cpt = (summary.size > 0) ?
                    summary.data[summary.size - 1].value :
-                    p_cuts_->min_vals_[col_id - beg_col];
+                    local_min_vals[col_id - beg_col];
    cpt += fabs(cpt) + 1e-5;
-    p_cuts_->cut_values_.emplace_back(cpt);
+    local_cuts.emplace_back(cpt);

-    p_cuts_->cut_ptrs_.emplace_back(p_cuts_->cut_values_.size());
+    local_ptrs.emplace_back(local_cuts.size());
  }
 }

@@ -196,33 +199,40 @@ void SparseCuts::Concat(
    std::vector<std::unique_ptr<SparseCuts>> const& cuts, uint32_t n_cols) {
  monitor_.Start(__FUNCTION__);
  uint32_t nthreads = omp_get_max_threads();
-  p_cuts_->min_vals_.resize(n_cols, std::numeric_limits<float>::max());
+  auto &local_min_vals = p_cuts_->min_vals_.HostVector();
+  auto &local_cuts = p_cuts_->cut_values_.HostVector();
+  auto &local_ptrs = p_cuts_->cut_ptrs_.HostVector();
+  local_min_vals.resize(n_cols, std::numeric_limits<float>::max());
  size_t min_vals_tail = 0;

  for (uint32_t t = 0; t < nthreads; ++t) {
+    auto& thread_min_vals = cuts[t]->p_cuts_->min_vals_.HostVector();
+    auto& thread_cuts = cuts[t]->p_cuts_->cut_values_.HostVector();
+    auto& thread_ptrs = cuts[t]->p_cuts_->cut_ptrs_.HostVector();
+
    // concat csc pointers.
-    size_t const old_ptr_size = p_cuts_->cut_ptrs_.size();
-    p_cuts_->cut_ptrs_.resize(
-        cuts[t]->p_cuts_->cut_ptrs_.size() + p_cuts_->cut_ptrs_.size() - 1);
-    size_t const new_icp_size = p_cuts_->cut_ptrs_.size();
-    auto tail = p_cuts_->cut_ptrs_[old_ptr_size-1];
+    size_t const old_ptr_size = local_ptrs.size();
+    local_ptrs.resize(
+        thread_ptrs.size() + local_ptrs.size() - 1);
+    size_t const new_icp_size = local_ptrs.size();
+    auto tail = local_ptrs[old_ptr_size-1];
    for (size_t j = old_ptr_size; j < new_icp_size; ++j) {
-      p_cuts_->cut_ptrs_[j] = tail + cuts[t]->p_cuts_->cut_ptrs_[j-old_ptr_size+1];
+      local_ptrs[j] = tail + thread_ptrs[j-old_ptr_size+1];
    }
    // concat csc values
-    size_t const old_iv_size = p_cuts_->cut_values_.size();
-    p_cuts_->cut_values_.resize(
-        cuts[t]->p_cuts_->cut_values_.size() + p_cuts_->cut_values_.size());
-    size_t const new_iv_size = p_cuts_->cut_values_.size();
+    size_t const old_iv_size = local_cuts.size();
+    local_cuts.resize(
+        thread_cuts.size() + local_cuts.size());
+    size_t const new_iv_size = local_cuts.size();
    for (size_t j = old_iv_size; j < new_iv_size; ++j) {
-      p_cuts_->cut_values_[j] = cuts[t]->p_cuts_->cut_values_[j-old_iv_size];
+      local_cuts[j] = thread_cuts[j-old_iv_size];
    }
    // merge min values
-    for (size_t j = 0; j < cuts[t]->p_cuts_->min_vals_.size(); ++j) {
-      p_cuts_->min_vals_.at(min_vals_tail + j) =
-          std::min(p_cuts_->min_vals_.at(min_vals_tail + j), cuts.at(t)->p_cuts_->min_vals_.at(j));
+    for (size_t j = 0; j < thread_min_vals.size(); ++j) {
+       local_min_vals.at(min_vals_tail + j) =
+          std::min(local_min_vals.at(min_vals_tail + j), thread_min_vals.at(j));
    }
-    min_vals_tail += cuts[t]->p_cuts_->min_vals_.size();
+    min_vals_tail += thread_min_vals.size();
  }
  monitor_.Stop(__FUNCTION__);
 }
@@ -323,27 +333,27 @@ void DenseCuts::Init
  // TODO(chenqin): rabit failure recovery assumes no boostrap onetime call after loadcheckpoint
  // we need to move this allreduce before loadcheckpoint call in future
  sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
-  p_cuts_->min_vals_.resize(sketchs.size());
+  p_cuts_->min_vals_.HostVector().resize(sketchs.size());

  for (size_t fid = 0; fid < summary_array.size(); ++fid) {
    WQSketch::SummaryContainer a;
    a.Reserve(max_num_bins + 1);
    a.SetPrune(summary_array[fid], max_num_bins + 1);
    const bst_float mval = a.data[0].value;
-    p_cuts_->min_vals_[fid] = mval - (fabs(mval) + 1e-5);
+    p_cuts_->min_vals_.HostVector()[fid] = mval - (fabs(mval) + 1e-5);
    AddCutPoint(a, max_num_bins);
    // push a value that is greater than anything
    const bst_float cpt
-      = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_[fid];
+      = (a.size > 0) ? a.data[a.size - 1].value : p_cuts_->min_vals_.HostVector()[fid];
    // this must be bigger than last value in a scale
    const bst_float last = cpt + (fabs(cpt) + 1e-5);
-    p_cuts_->cut_values_.push_back(last);
+    p_cuts_->cut_values_.HostVector().push_back(last);

    // Ensure that every feature gets at least one quantile point
-    CHECK_LE(p_cuts_->cut_values_.size(), std::numeric_limits<uint32_t>::max());
-    auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.size());
-    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back());
-    p_cuts_->cut_ptrs_.push_back(cut_size);
+    CHECK_LE(p_cuts_->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
+    auto cut_size = static_cast<uint32_t>(p_cuts_->cut_values_.HostVector().size());
+    CHECK_GT(cut_size, p_cuts_->cut_ptrs_.HostVector().back());
+    p_cuts_->cut_ptrs_.HostVector().push_back(cut_size);
  }
  monitor_.Stop(__func__);
 }
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -44,17 +44,35 @@ class HistogramCuts {
  using BinIdx = uint32_t;
  common::Monitor monitor_;

-  std::vector<bst_float> cut_values_;
-  std::vector<uint32_t> cut_ptrs_;
-  std::vector<float> min_vals_;  // storing minimum value in a sketch set.
-
 public:
+  HostDeviceVector<bst_float> cut_values_;
+  HostDeviceVector<uint32_t> cut_ptrs_;
+  HostDeviceVector<float> min_vals_;  // storing minimum value in a sketch set.
+
  HistogramCuts();
-  HistogramCuts(HistogramCuts const& that) = delete;
+  HistogramCuts(HistogramCuts const& that) {
+    cut_values_.Resize(that.cut_values_.Size());
+    cut_ptrs_.Resize(that.cut_ptrs_.Size());
+    min_vals_.Resize(that.min_vals_.Size());
+    cut_values_.Copy(that.cut_values_);
+    cut_ptrs_.Copy(that.cut_ptrs_);
+    min_vals_.Copy(that.min_vals_);
+  }
+
  HistogramCuts(HistogramCuts&& that) noexcept(true) {
    *this = std::forward<HistogramCuts&&>(that);
  }
-  HistogramCuts& operator=(HistogramCuts const& that) = delete;
+
+  HistogramCuts& operator=(HistogramCuts const& that) {
+    cut_values_.Resize(that.cut_values_.Size());
+    cut_ptrs_.Resize(that.cut_ptrs_.Size());
+    min_vals_.Resize(that.min_vals_.Size());
+    cut_values_.Copy(that.cut_values_);
+    cut_ptrs_.Copy(that.cut_ptrs_);
+    min_vals_.Copy(that.min_vals_);
+    return *this;
+  }
+
  HistogramCuts& operator=(HistogramCuts&& that) noexcept(true) {
    monitor_ = std::move(that.monitor_);
    cut_ptrs_ = std::move(that.cut_ptrs_);
@@ -67,28 +85,30 @@ class HistogramCuts {
  void Build(DMatrix* dmat, uint32_t const max_num_bins);
  /* \brief How many bins a feature has. */
  uint32_t FeatureBins(uint32_t feature) const {
-    return cut_ptrs_.at(feature+1) - cut_ptrs_[feature];
+    return cut_ptrs_.ConstHostVector().at(feature + 1) -
+           cut_ptrs_.ConstHostVector()[feature];
  }

  // Getters.  Cuts should be of no use after building histogram indices, but currently
  // it's deeply linked with quantile_hist, gpu sketcher and gpu_hist.  So we preserve
  // these for now.
-  std::vector<uint32_t> const& Ptrs()      const { return cut_ptrs_;   }
-  std::vector<float>    const& Values()    const { return cut_values_; }
-  std::vector<float>    const& MinValues() const { return min_vals_;   }
+  std::vector<uint32_t> const& Ptrs()      const { return cut_ptrs_.ConstHostVector();   }
+  std::vector<float>    const& Values()    const { return cut_values_.ConstHostVector(); }
+  std::vector<float>    const& MinValues() const { return min_vals_.ConstHostVector();   }

-  size_t TotalBins() const { return cut_ptrs_.back(); }
+  size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }

  // Return the index of a cut point that is strictly greater than the input
  // value, or the last available index if none exists
  BinIdx SearchBin(float value, uint32_t column_id) const {
-    auto beg = cut_ptrs_.at(column_id);
-    auto end = cut_ptrs_.at(column_id + 1);
-    auto it = std::upper_bound(cut_values_.cbegin() + beg, cut_values_.cbegin() + end, value);
-    if (it == cut_values_.cend()) {
-      it = cut_values_.cend() - 1;
+    auto beg = cut_ptrs_.ConstHostVector().at(column_id);
+    auto end = cut_ptrs_.ConstHostVector().at(column_id + 1);
+    const auto &values = cut_values_.ConstHostVector();
+    auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
+    if (it == values.cend()) {
+      it = values.cend() - 1;
    }
-    BinIdx idx = it - cut_values_.cbegin();
+    BinIdx idx = it - values.cbegin();
    return idx;
  }

@@ -133,8 +153,8 @@ class CutsBuilder {
    size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
    for (size_t i = 1; i < required_cuts; ++i) {
      bst_float cpt = summary.data[i].value;
-      if (i == 1 || cpt > p_cuts_->cut_values_.back()) {
-        p_cuts_->cut_values_.push_back(cpt);
+      if (i == 1 || cpt > p_cuts_->cut_values_.ConstHostVector().back()) {
+        p_cuts_->cut_values_.HostVector().push_back(cpt);
      }
    }
  }
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -371,6 +371,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 template class HostDeviceVector<bst_float>;
 template class HostDeviceVector<GradientPair>;
 template class HostDeviceVector<int32_t>;   // bst_node_t
+template class HostDeviceVector<uint8_t>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
 template class HostDeviceVector<uint32_t>;  // bst_feature_t
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -13,11 +13,24 @@ class EllpackPageImpl {};
 EllpackPage::EllpackPage() = default;

 EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param) {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
 }

 EllpackPage::~EllpackPage() {
-  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but EllpackPage is required";
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+void EllpackPage::SetBaseRowId(size_t row_id) {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+}
+
+size_t EllpackPage::Size() const {
+  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
+                "EllpackPage is required";
+  return 0;
 }

 }  // namespace xgboost
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -4,9 +4,9 @@

 #include <xgboost/data.h>

-#include "./ellpack_page.cuh"
 #include "../common/hist_util.h"
 #include "../common/random.h"
+#include "./ellpack_page.cuh"

 namespace xgboost {

@@ -17,13 +17,9 @@ EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)

 EllpackPage::~EllpackPage() = default;

-size_t EllpackPage::Size() const {
-  return impl_->Size();
-}
+size_t EllpackPage::Size() const { return impl_->Size(); }

-void EllpackPage::SetBaseRowId(size_t row_id) {
-  impl_->SetBaseRowId(row_id);
-}
+void EllpackPage::SetBaseRowId(size_t row_id) { impl_->SetBaseRowId(row_id); }

 // Bin each input data entry, store the bin indices in compressed form.
 __global__ void CompressBinEllpackKernel(
@@ -65,16 +61,18 @@ __global__ void CompressBinEllpackKernel(
 }

 // Construct an ELLPACK matrix with the given number of empty rows.
-EllpackPageImpl::EllpackPageImpl(int device, EllpackInfo info, size_t n_rows) {
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 bool is_dense, size_t row_stride,
+                                 size_t n_rows)
+    : is_dense(is_dense),
+      cuts_(std::move(cuts)),
+      row_stride(row_stride),
+      n_rows(n_rows) {
  monitor_.Init("ellpack_page");
  dh::safe_cuda(cudaSetDevice(device));

-  matrix.info = info;
-  matrix.base_rowid = 0;
-  matrix.n_rows = n_rows;
-
  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, n_rows);
+  InitCompressedData(device);
  monitor_.StopCuda("InitCompressedData");
 }

@@ -93,33 +91,27 @@ size_t GetRowStride(DMatrix* dmat) {
 }

 // Construct an ELLPACK matrix in memory.
-EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param) {
+EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
+    : is_dense(dmat->IsDense()) {
  monitor_.Init("ellpack_page");
  dh::safe_cuda(cudaSetDevice(param.gpu_id));

-  matrix.n_rows = dmat->Info().num_row_;
+  n_rows = dmat->Info().num_row_;

  monitor_.StartCuda("Quantiles");
  // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
-  size_t row_stride = GetRowStride(dmat);
-  auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
+  row_stride = GetRowStride(dmat);
+  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin,
                                   param.gpu_batch_nrows);
  monitor_.StopCuda("Quantiles");

-  monitor_.StartCuda("InitEllpackInfo");
-  InitInfo(param.gpu_id, dmat->IsDense(), row_stride, cuts);
-  monitor_.StopCuda("InitEllpackInfo");
-
  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(param.gpu_id, dmat->Info().num_row_);
+  InitCompressedData(param.gpu_id);
  monitor_.StopCuda("InitCompressedData");

  monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(dmat->Info().num_row_);
  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    hist_builder_row_state.BeginBatch(batch);
-    CreateHistIndices(param.gpu_id, batch, hist_builder_row_state.GetRowStateOnDevice());
-    hist_builder_row_state.EndBatch();
+    CreateHistIndices(param.gpu_id, batch);
  }
  monitor_.StopCuda("BinningCompression");
 }
@@ -133,23 +125,26 @@ struct CopyPage {
  size_t offset;

  CopyPage(EllpackPageImpl* dst, EllpackPageImpl* src, size_t offset)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
        offset(offset) {}

  __device__ void operator()(size_t element_id) {
-    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id], element_id + offset);
+    cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[element_id],
+                          element_id + offset);
  }
 };

 // Copy the data from the given EllpackPage to the current page.
 size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
  monitor_.StartCuda("Copy");
-  size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_GE(matrix.n_rows * matrix.info.row_stride, offset + num_elements);
+  size_t num_elements = page->n_rows * page->row_stride;
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_GE(n_rows * row_stride, offset + num_elements);
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
  dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
  monitor_.StopCuda("Copy");
  return num_elements;
@@ -160,26 +155,29 @@ struct CompactPage {
  common::CompressedBufferWriter cbw;
  common::CompressedByteT* dst_data_d;
  common::CompressedIterator<uint32_t> src_iterator_d;
-  /*! \brief An array that maps the rows from the full DMatrix to the compacted page.
+  /*! \brief An array that maps the rows from the full DMatrix to the compacted
+   * page.
   *
-   * The total size is the number of rows in the original, uncompacted DMatrix. Elements are the
-   * row ids in the compacted page. Rows not needed are set to SIZE_MAX.
+   * The total size is the number of rows in the original, uncompacted DMatrix.
+   * Elements are the row ids in the compacted page. Rows not needed are set to
+   * SIZE_MAX.
   *
   * An example compacting 16 rows to 8 rows:
-   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6, SIZE_MAX, 7, SIZE_MAX,
-   * SIZE_MAX]
+   * [SIZE_MAX, 0, 1, SIZE_MAX, SIZE_MAX, 2, SIZE_MAX, 3, 4, 5, SIZE_MAX, 6,
+   * SIZE_MAX, 7, SIZE_MAX, SIZE_MAX]
   */
  common::Span<size_t> row_indexes;
  size_t base_rowid;
  size_t row_stride;

-  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src, common::Span<size_t> row_indexes)
-      : cbw{dst->matrix.info.NumSymbols()},
-        dst_data_d{dst->gidx_buffer.data()},
-        src_iterator_d{src->gidx_buffer.data(), src->matrix.info.NumSymbols()},
+  CompactPage(EllpackPageImpl* dst, EllpackPageImpl* src,
+              common::Span<size_t> row_indexes)
+      : cbw{dst->NumSymbols()},
+        dst_data_d{dst->gidx_buffer.DevicePointer()},
+        src_iterator_d{src->gidx_buffer.DevicePointer(), src->NumSymbols()},
        row_indexes(row_indexes),
-        base_rowid{src->matrix.base_rowid},
-        row_stride{src->matrix.info.row_stride} {}
+        base_rowid{src->base_rowid},
+        row_stride{src->row_stride} {}

  __device__ void operator()(size_t row_id) {
    size_t src_row = base_rowid + row_id;
@@ -188,100 +186,72 @@ struct CompactPage {
    size_t dst_offset = dst_row * row_stride;
    size_t src_offset = row_id * row_stride;
    for (size_t j = 0; j < row_stride; j++) {
-      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j], dst_offset + j);
+      cbw.AtomicWriteSymbol(dst_data_d, src_iterator_d[src_offset + j],
+                            dst_offset + j);
    }
  }
 };

 // Compacts the data from the given EllpackPage into the current page.
-void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes) {
+void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
+                              common::Span<size_t> row_indexes) {
  monitor_.StartCuda("Compact");
-  CHECK_EQ(matrix.info.row_stride, page->matrix.info.row_stride);
-  CHECK_EQ(matrix.info.NumSymbols(), page->matrix.info.NumSymbols());
-  CHECK_LE(page->matrix.base_rowid + page->matrix.n_rows, row_indexes.size());
-  dh::LaunchN(device, page->matrix.n_rows, CompactPage(this, page, row_indexes));
+  CHECK_EQ(row_stride, page->row_stride);
+  CHECK_EQ(NumSymbols(), page->NumSymbols());
+  CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
+  gidx_buffer.SetDevice(device);
+  page->gidx_buffer.SetDevice(device);
+  dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
  monitor_.StopCuda("Compact");
 }

-// Construct an EllpackInfo based on histogram cuts of features.
-EllpackInfo::EllpackInfo(int device,
-                         bool is_dense,
-                         size_t row_stride,
-                         const common::HistogramCuts& hmat,
-                         dh::BulkAllocator* ba)
-    : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {
-
-  ba->Allocate(device,
-               &feature_segments, hmat.Ptrs().size(),
-               &gidx_fvalue_map, hmat.Values().size(),
-               &min_fvalue, hmat.MinValues().size());
-  dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
-  dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
-  dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
-}
-
-// Initialize the EllpackInfo for this page.
-void EllpackPageImpl::InitInfo(int device,
-                               bool is_dense,
-                               size_t row_stride,
-                               const common::HistogramCuts& hmat) {
-  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
-}
-
 // Initialize the buffer to stored compressed features.
-void EllpackPageImpl::InitCompressedData(int device, size_t num_rows) {
-  size_t num_symbols = matrix.info.NumSymbols();
+void EllpackPageImpl::InitCompressedData(int device) {
+  size_t num_symbols = NumSymbols();

  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * num_rows, num_symbols);
-  ba_.Allocate(device, &gidx_buffer, compressed_size_bytes);
-
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
+                                                          num_symbols);
+  gidx_buffer.SetDevice(device);
+  // Don't call fill unnecessarily
+  if (gidx_buffer.Size() == 0) {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
+  } else {
+    gidx_buffer.Resize(compressed_size_bytes, 0);
    thrust::fill(dh::tbegin(gidx_buffer), dh::tend(gidx_buffer), 0);
-
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols);
+  }
 }

 // Compress a CSR page into ELLPACK.
 void EllpackPageImpl::CreateHistIndices(int device,
-                                        const SparsePage& row_batch,
-                                        const RowStateOnDevice& device_row_state) {
-  // Has any been allocated for me in this batch?
-  if (!device_row_state.rows_to_process_from_batch) return;
-
-  unsigned int null_gidx_value = matrix.info.n_bins;
-  size_t row_stride = matrix.info.row_stride;
+                                        const SparsePage& row_batch) {
+  if (row_batch.Size() == 0) return;
+  unsigned int null_gidx_value = NumSymbols() - 1;

  const auto& offset_vec = row_batch.offset.ConstHostVector();

  // bin and compress entries in batches of rows
-  size_t gpu_batch_nrows = std::min(
-      dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
-      static_cast<size_t>(device_row_state.rows_to_process_from_batch));
+  size_t gpu_batch_nrows =
+      std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
+               static_cast<size_t>(row_batch.Size()));
  const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();

-  size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
-                                           gpu_batch_nrows);
+  size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);

  for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
    size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
-    size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
-    if (batch_row_end > device_row_state.rows_to_process_from_batch) {
-      batch_row_end = device_row_state.rows_to_process_from_batch;
-    }
+    size_t batch_row_end =
+        std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size());
    size_t batch_nrows = batch_row_end - batch_row_begin;

-    const auto ent_cnt_begin =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
-    const auto ent_cnt_end =
-        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+    const auto ent_cnt_begin = offset_vec[batch_row_begin];
+    const auto ent_cnt_end = offset_vec[batch_row_end];

    /*! \brief row offset in SparsePage (the input data). */
    dh::device_vector<size_t> row_ptrs(batch_nrows + 1);
-    thrust::copy(
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
-        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
-        row_ptrs.begin());
+    thrust::copy(offset_vec.data() + batch_row_begin,
+                 offset_vec.data() + batch_row_end + 1, row_ptrs.begin());

    // number of entries in this batch.
    size_t n_entries = ent_cnt_end - ent_cnt_begin;
@@ -289,97 +259,50 @@ void EllpackPageImpl::CreateHistIndices(int device,
    // copy data entries to device.
    dh::safe_cuda(cudaMemcpy(entries_d.data().get(),
                             data_vec.data() + ent_cnt_begin,
-                             n_entries * sizeof(Entry),
-                             cudaMemcpyDefault));
+                             n_entries * sizeof(Entry), cudaMemcpyDefault));
    const dim3 block3(32, 8, 1);  // 256 threads
    const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
-                     common::DivRoundUp(row_stride, block3.y),
-                     1);
-    dh::LaunchKernel {grid3, block3} (
-        CompressBinEllpackKernel,
-        common::CompressedBufferWriter(matrix.info.NumSymbols()),
-        gidx_buffer.data(),
-        row_ptrs.data().get(),
-        entries_d.data().get(),
-        matrix.info.gidx_fvalue_map.data(),
-        matrix.info.feature_segments.data(),
-        device_row_state.total_rows_processed + batch_row_begin,
-        batch_nrows,
-        row_stride,
+                     common::DivRoundUp(row_stride, block3.y), 1);
+    auto device_accessor = GetDeviceAccessor(device);
+    dh::LaunchKernel {grid3, block3}(
+        CompressBinEllpackKernel, common::CompressedBufferWriter(NumSymbols()),
+        gidx_buffer.DevicePointer(), row_ptrs.data().get(),
+        entries_d.data().get(), device_accessor.gidx_fvalue_map.data(),
+        device_accessor.feature_segments.data(),
+        row_batch.base_rowid + batch_row_begin, batch_nrows, row_stride,
        null_gidx_value);
  }
 }

 // Return the number of rows contained in this page.
-size_t EllpackPageImpl::Size() const {
-  return matrix.n_rows;
-}
-
-// Clear the current page.
-void EllpackPageImpl::Clear() {
-  ba_.Clear();
-  gidx_buffer = {};
-  idx_buffer.clear();
-  sparse_page_.Clear();
-  matrix.base_rowid = 0;
-  matrix.n_rows = 0;
-  device_initialized_ = false;
-}
-
-// Push a CSR page to the current page.
-//
-// The CSR pages are accumulated in memory until they reach a certain size, then written out as
-// compressed ELLPACK.
-void EllpackPageImpl::Push(int device, const SparsePage& batch) {
-  sparse_page_.Push(batch);
-  matrix.n_rows += batch.Size();
-}
-
-// Compress the accumulated SparsePage.
-void EllpackPageImpl::CompressSparsePage(int device) {
-  monitor_.StartCuda("InitCompressedData");
-  InitCompressedData(device, matrix.n_rows);
-  monitor_.StopCuda("InitCompressedData");
-
-  monitor_.StartCuda("BinningCompression");
-  DeviceHistogramBuilderState hist_builder_row_state(matrix.n_rows);
-  hist_builder_row_state.BeginBatch(sparse_page_);
-  CreateHistIndices(device, sparse_page_, hist_builder_row_state.GetRowStateOnDevice());
-  hist_builder_row_state.EndBatch();
-  monitor_.StopCuda("BinningCompression");
-
-  monitor_.StartCuda("CopyDeviceToHost");
-  idx_buffer.resize(gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&idx_buffer, gidx_buffer);
-  ba_.Clear();
-  gidx_buffer = {};
-  monitor_.StopCuda("CopyDeviceToHost");
-}
+size_t EllpackPageImpl::Size() const { return n_rows; }

 // Return the memory cost for storing the compressed features.
-size_t EllpackPageImpl::MemCostBytes() const {
-  // Required buffer size for storing data matrix in ELLPack format.
-  size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize(
-      matrix.info.row_stride * matrix.n_rows, matrix.info.NumSymbols());
+size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
+                                     const common::HistogramCuts& cuts) {
+  // Required buffer size for storing data matrix in EtoLLPack format.
+  size_t compressed_size_bytes =
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * num_rows,
+                                                          cuts.TotalBins() + 1);
  return compressed_size_bytes;
 }

-// Copy the compressed features to GPU.
-void EllpackPageImpl::InitDevice(int device, EllpackInfo info) {
-  if (device_initialized_) return;
+EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(int device) const {
+  gidx_buffer.SetDevice(device);
+  return EllpackDeviceAccessor(
+      device, cuts_, is_dense, row_stride, base_rowid, n_rows,
+      common::CompressedIterator<uint32_t>(gidx_buffer.ConstDevicePointer(),
+                                           NumSymbols()));
+}

-  monitor_.StartCuda("CopyPageToDevice");
-  dh::safe_cuda(cudaSetDevice(device));
-
-  gidx_buffer = {};
-  ba_.Allocate(device, &gidx_buffer, idx_buffer.size());
-  dh::CopyVectorToDeviceSpan(gidx_buffer, idx_buffer);
-
-  matrix.info = info;
-  matrix.gidx_iter = common::CompressedIterator<uint32_t>(gidx_buffer.data(), info.n_bins + 1);
-
-  monitor_.StopCuda("CopyPageToDevice");
-
-  device_initialized_ = true;
+EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
+                                 const SparsePage& page, bool is_dense,
+                                 size_t row_stride)
+    : cuts_(std::move(cuts)),
+      is_dense(is_dense),
+      n_rows(page.Size()),
+      row_stride(row_stride) {
+  this->InitCompressedData(device);
+  this->CreateHistIndices(device, page);
 }
 }  // namespace xgboost
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -40,71 +40,53 @@ __forceinline__ __device__ int BinarySearchRow(
  return -1;
 }

-/** \brief Meta information about the ELLPACK matrix. */
-struct EllpackInfo {
+/** \brief Struct for accessing and manipulating an ellpack matrix on the
+ * device. Does not own underlying memory and may be trivially copied into
+ * kernels.*/
+struct EllpackDeviceAccessor {
  /*! \brief Whether or not if the matrix is dense. */
  bool is_dense;
  /*! \brief Row length for ELLPack, equal to number of features. */
  size_t row_stride;
-  /*! \brief Total number of bins, also used as the null index value, . */
-  size_t n_bins;
-  /*! \brief Minimum value for each feature. Size equals to number of features. */
-  common::Span<bst_float> min_fvalue;
-  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
-  common::Span<uint32_t> feature_segments;
-  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
-  common::Span<bst_float> gidx_fvalue_map;
-
-  EllpackInfo() = default;
-
-  /*!
-   * \brief Constructor.
-   *
-   * @param device The GPU device to use.
-   * @param is_dense Whether the matrix is dense.
-   * @param row_stride The number of features between starts of consecutive rows.
-   * @param hmat The histogram cuts of all the features.
-   * @param ba The BulkAllocator that owns the GPU memory.
-   */
-  explicit EllpackInfo(int device,
-                       bool is_dense,
-                       size_t row_stride,
-                       const common::HistogramCuts& hmat,
-                       dh::BulkAllocator* ba);
-
-  /*! \brief Return the total number of symbols (total number of bins plus 1 for not found). */
-  size_t NumSymbols() const {
-    return n_bins + 1;
-  }
-  size_t NumFeatures() const {
-    return min_fvalue.size();
-  }
-};
-
-/** \brief Struct for accessing and manipulating an ellpack matrix on the
- * device. Does not own underlying memory and may be trivially copied into
- * kernels.*/
-struct EllpackMatrix {
-  EllpackInfo info;
  size_t base_rowid{};
  size_t n_rows{};
  common::CompressedIterator<uint32_t> gidx_iter;
+  /*! \brief Minimum value for each feature. Size equals to number of features. */
+  common::Span<const bst_float> min_fvalue;
+  /*! \brief Histogram cut pointers. Size equals to (number of features + 1). */
+  common::Span<const uint32_t> feature_segments;
+  /*! \brief Histogram cut values. Size equals to (bins per feature * number of features). */
+  common::Span<const bst_float> gidx_fvalue_map;

+  EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
+                        bool is_dense, size_t row_stride, size_t base_rowid,
+                        size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter)
+      : is_dense(is_dense),
+        row_stride(row_stride),
+        base_rowid(base_rowid),
+        n_rows(n_rows) ,gidx_iter(gidx_iter){
+    cuts.cut_values_.SetDevice(device);
+    cuts.cut_ptrs_.SetDevice(device);
+    cuts.min_vals_.SetDevice(device);
+    gidx_fvalue_map = cuts.cut_values_.ConstDeviceSpan();
+    feature_segments = cuts.cut_ptrs_.ConstDeviceSpan();
+    min_fvalue = cuts.min_vals_.ConstDeviceSpan();
+  }
  // Get a matrix element, uses binary search for look up Return NaN if missing
  // Given a row index and a feature index, returns the corresponding cut value
  __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
    ridx -= base_rowid;
-    auto row_begin = info.row_stride * ridx;
-    auto row_end = row_begin + info.row_stride;
+    auto row_begin = row_stride * ridx;
+    auto row_end = row_begin + row_stride;
    auto gidx = -1;
-    if (info.is_dense) {
+    if (is_dense) {
      gidx = gidx_iter[row_begin + fidx];
    } else {
      gidx = BinarySearchRow(row_begin,
                             row_end,
                             gidx_iter,
-                             info.feature_segments[fidx],
-                             info.feature_segments[fidx + 1]);
+                             feature_segments[fidx],
+                             feature_segments[fidx + 1]);
    }
    return gidx;
  }
@@ -113,97 +95,27 @@ struct EllpackMatrix {
    if (gidx == -1) {
      return nan("");
    }
-    return info.gidx_fvalue_map[gidx];
+    return gidx_fvalue_map[gidx];
  }

  // Check if the row id is withing range of the current batch.
  __device__ bool IsInRange(size_t row_id) const {
    return row_id >= base_rowid && row_id < base_rowid + n_rows;
  }
+  /*! \brief Return the total number of symbols (total number of bins plus 1 for
+   * not found). */
+  size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
+
+  size_t NullValue() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
+
+  XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
 };

-// Instances of this type are created while creating the histogram bins for the
-// entire dataset across multiple sparse page batches. This keeps track of the number
-// of rows to process from a batch and the position from which to process on each device.
-struct RowStateOnDevice {
-  // Number of rows assigned to this device
-  size_t total_rows_assigned_to_device;
-  // Number of rows processed thus far
-  size_t total_rows_processed;
-  // Number of rows to process from the current sparse page batch
-  size_t rows_to_process_from_batch;
-  // Offset from the current sparse page batch to begin processing
-  size_t row_offset_in_current_batch;
-
-  explicit RowStateOnDevice(size_t total_rows)
-      : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-        rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
-  }
-
-  explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
-      : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
-        rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
-  }
-
-  // Advance the row state by the number of rows processed
-  void Advance() {
-    total_rows_processed += rows_to_process_from_batch;
-    CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
-    rows_to_process_from_batch = row_offset_in_current_batch = 0;
-  }
-};
-
-// An instance of this type is created which keeps track of total number of rows to process,
-// rows processed thus far, rows to process and the offset from the current sparse page batch
-// to begin processing on each device
-class DeviceHistogramBuilderState {
- public:
-  explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {}
-
-  const RowStateOnDevice& GetRowStateOnDevice() const {
-    return device_row_state_;
-  }
-
-  // This method is invoked at the beginning of each sparse page batch. This distributes
-  // the rows in the sparse page to the device.
-  // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
-  void BeginBatch(const SparsePage &batch) {
-    size_t rem_rows = batch.Size();
-    size_t row_offset_in_current_batch = 0;
-
-    // Do we have anymore left to process from this batch on this device?
-    if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
-      // There are still some rows that needs to be assigned to this device
-      device_row_state_.rows_to_process_from_batch =
-          std::min(
-              device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
-              rem_rows);
-    } else {
-      // All rows have been assigned to this device
-      device_row_state_.rows_to_process_from_batch = 0;
-    }
-
-    device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
-    row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
-    rem_rows -= device_row_state_.rows_to_process_from_batch;
-  }
-
-  // This method is invoked after completion of each sparse page batch
-  void EndBatch() {
-    device_row_state_.Advance();
-  }
-
- private:
-  RowStateOnDevice device_row_state_{0};
-};

 class EllpackPageImpl {
 public:
-  EllpackMatrix matrix;
-  /*! \brief global index of histogram, which is stored in ELLPack format. */
-  common::Span<common::CompressedByteT> gidx_buffer;
-  std::vector<common::CompressedByteT> idx_buffer;
-
  /*!
   * \brief Default constructor.
   *
@@ -218,7 +130,12 @@ class EllpackPageImpl {
   * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
   * and the given number of rows.
   */
-  explicit EllpackPageImpl(int device, EllpackInfo info, size_t n_rows);
+  EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
+                  size_t row_stride, size_t n_rows);
+
+  EllpackPageImpl(int device, common::HistogramCuts cuts,
+                  const SparsePage& page,
+                  bool is_dense,size_t row_stride);

  /*!
   * \brief Constructor from an existing DMatrix.
@@ -245,77 +162,53 @@ class EllpackPageImpl {
   */
  void Compact(int device, EllpackPageImpl* page, common::Span<size_t> row_indexes);

-  /*!
-   * \brief Initialize the EllpackInfo contained in the EllpackMatrix.
-   *
-   * This is used in the in-memory case. The current page owns the BulkAllocator, which in turn owns
-   * the GPU memory used by the EllpackInfo.
-   *
-   * @param device The GPU device to use.
-   * @param is_dense Whether the matrix is dense.
-   * @param row_stride The number of features between starts of consecutive rows.
-   * @param hmat The histogram cuts of all the features.
-   */
-  void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat);
-
-  /*!
-   * \brief Initialize the buffer to store compressed features.
-   *
-   * @param device The GPU device to use.
-   * @param num_rows The number of rows we are storing in the buffer.
-   */
-  void InitCompressedData(int device, size_t num_rows);
-
-  /*!
-   * \brief Compress a single page of CSR data into ELLPACK.
-   *
-   * @param device The GPU device to use.
-   * @param row_batch The CSR page.
-   * @param device_row_state On-device data for maintaining state.
-   */
-  void CreateHistIndices(int device,
-                         const SparsePage& row_batch,
-                         const RowStateOnDevice& device_row_state);

  /*! \return Number of instances in the page. */
  size_t Size() const;

  /*! \brief Set the base row id for this page. */
-  inline void SetBaseRowId(size_t row_id) {
-    matrix.base_rowid = row_id;
+  void SetBaseRowId(size_t row_id) {
+    base_rowid = row_id;
  }

-  /*! \brief clear the page. */
-  void Clear();
-
-  /*!
-   * \brief Push a sparse page.
-   * \param batch The row page.
-   */
-  void Push(int device, const SparsePage& batch);
-
  /*! \return Estimation of memory cost of this page. */
-  size_t MemCostBytes() const;
+  static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;

-  /*!
-   * \brief Copy the ELLPACK matrix to GPU.
-   *
-   * @param device The GPU device to use.
-   * @param info The EllpackInfo for the matrix.
-   */
-  void InitDevice(int device, EllpackInfo info);

-  /*! \brief Compress the accumulated SparsePage into ELLPACK format.
-   *
-   * @param device The GPU device to use.
-   */
-  void CompressSparsePage(int device);
+  /*! \brief Return the total number of symbols (total number of bins plus 1 for
+   * not found). */
+  size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
+
+  EllpackDeviceAccessor GetDeviceAccessor(int device) const;

 private:
+  /*!
+   * \brief Compress a single page of CSR data into ELLPACK.
+   *
+   * @param device The GPU device to use.
+   * @param row_batch The CSR page.
+   */
+  void CreateHistIndices(int device,
+                         const SparsePage& row_batch
+                         );
+  /*!
+   * \brief Initialize the buffer to store compressed features.
+   */
+  void InitCompressedData(int device);
+
+
+public:
+  /*! \brief Whether or not if the matrix is dense. */
+  bool is_dense;
+  /*! \brief Row length for ELLPack. */
+  size_t row_stride;
+  size_t base_rowid{0};
+  size_t n_rows{};
+  /*! \brief global index of histogram, which is stored in ELLPack format. */
+  HostDeviceVector<common::CompressedByteT> gidx_buffer;
+  common::HistogramCuts cuts_;
+private:
  common::Monitor monitor_;
-  dh::BulkAllocator ba_;
-  bool device_initialized_{false};
-  SparsePage sparse_page_{};
 };

 }  // namespace xgboost
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -17,26 +17,35 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
 public:
  bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
    auto* impl = page->Impl();
-    impl->Clear();
-    if (!fi->Read(&impl->matrix.n_rows)) return false;
-    return fi->Read(&impl->idx_buffer);
+    fi->Read(&impl->cuts_.cut_values_.HostVector());
+    fi->Read(&impl->cuts_.cut_ptrs_.HostVector());
+    fi->Read(&impl->cuts_.min_vals_.HostVector());
+    fi->Read(&impl->n_rows);
+    fi->Read(&impl->is_dense);
+    fi->Read(&impl->row_stride);
+    if (!fi->Read(&impl->gidx_buffer.HostVector())) {
+      return false;
+    }
+    return true;
  }

  bool Read(EllpackPage* page,
            dmlc::SeekStream* fi,
            const std::vector<bst_uint>& sorted_index_set) override {
-    auto* impl = page->Impl();
-    impl->Clear();
-    if (!fi->Read(&impl->matrix.n_rows))  return false;
-    return fi->Read(&page->Impl()->idx_buffer);
+    LOG(FATAL) << "Not implemented";
+    return false;
  }

  void Write(const EllpackPage& page, dmlc::Stream* fo) override {
    auto* impl = page.Impl();
-    fo->Write(impl->matrix.n_rows);
-    auto buffer = impl->idx_buffer;
-    CHECK(!buffer.empty());
-    fo->Write(buffer);
+    fo->Write(impl->cuts_.cut_values_.ConstHostVector());
+    fo->Write(impl->cuts_.cut_ptrs_.ConstHostVector());
+    fo->Write(impl->cuts_.min_vals_.ConstHostVector());
+    fo->Write(impl->n_rows);
+    fo->Write(impl->is_dense);
+    fo->Write(impl->row_stride);
+    CHECK(!impl->gidx_buffer.ConstHostVector().empty());
+    fo->Write(impl->gidx_buffer.HostVector());
  }
 };

--- a/src/data/ellpack_page_source.cc
+++ b/src/data/ellpack_page_source.cc
@@ -2,45 +2,23 @@
 * Copyright 2019 XGBoost contributors
 */
 #ifndef XGBOOST_USE_CUDA
+#include <dmlc/base.h>
+#if DMLC_ENABLE_STD_THREAD

-#include <xgboost/data.h>
 #include "ellpack_page_source.h"
+#include <xgboost/data.h>
 namespace xgboost {
 namespace data {

 EllpackPageSource::EllpackPageSource(DMatrix* dmat,
                                     const std::string& cache_info,
                                     const BatchParam& param) noexcept(false) {
-  LOG(FATAL) << "Internal Error: "
+  LOG(FATAL)
+      << "Internal Error: "
         "XGBoost is not compiled with CUDA but EllpackPageSource is required";
 }

-void EllpackPageSource::BeforeFirst() {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-}
-
-bool EllpackPageSource::Next() {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-  return false;
-}
-
-EllpackPage& EllpackPageSource::Value() {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-  EllpackPage* page { nullptr };
-  return *page;
-}
-
-const EllpackPage& EllpackPageSource::Value() const {
-  LOG(FATAL) << "Internal Error: "
-                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
-  EllpackPage* page { nullptr };
-  return *page;
-}
-
 }  // namespace data
 }  // namespace xgboost
-
+#endif  // DMLC_ENABLE_STD_THREAD
 #endif  // XGBOOST_USE_CUDA
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -3,73 +3,16 @@
 */
 #include <memory>
 #include <utility>
-#include <vector>

 #include "../common/hist_util.h"

+#include "ellpack_page.cuh"
 #include "ellpack_page_source.h"
 #include "sparse_page_source.h"
-#include "ellpack_page.cuh"

 namespace xgboost {
 namespace data {

-class EllpackPageSourceImpl : public DataSource<EllpackPage> {
- public:
-  /*!
-   * \brief Create source from cache files the cache_prefix.
-   * \param cache_prefix The prefix of cache we want to solve.
-   */
-  explicit EllpackPageSourceImpl(DMatrix* dmat,
-                                 const std::string& cache_info,
-                                 const BatchParam& param) noexcept(false);
-
-  /*! \brief destructor */
-  ~EllpackPageSourceImpl() override = default;
-
-  void BeforeFirst() override;
-  bool Next() override;
-  EllpackPage& Value();
-  const EllpackPage& Value() const override;
-
- private:
-  /*! \brief Write Ellpack pages after accumulating them in memory. */
-  void WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const;
-
-  /*! \brief The page type string for ELLPACK. */
-  const std::string kPageType_{".ellpack.page"};
-
-  int device_{-1};
-  size_t page_size_{DMatrix::kPageSize};
-  common::Monitor monitor_;
-  dh::BulkAllocator ba_;
-  /*! \brief The EllpackInfo, with the underlying GPU memory shared by all pages. */
-  EllpackInfo ellpack_info_;
-  std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> source_;
-  std::string cache_info_;
-};
-
-EllpackPageSource::EllpackPageSource(DMatrix* dmat,
-                                     const std::string& cache_info,
-                                     const BatchParam& param) noexcept(false)
-    : impl_{new EllpackPageSourceImpl(dmat, cache_info, param)} {}
-
-void EllpackPageSource::BeforeFirst() {
-  impl_->BeforeFirst();
-}
-
-bool EllpackPageSource::Next() {
-  return impl_->Next();
-}
-
-EllpackPage& EllpackPageSource::Value() {
-  return impl_->Value();
-}
-
-const EllpackPage& EllpackPageSource::Value() const {
-  return impl_->Value();
-}
-
 size_t GetRowStride(DMatrix* dmat) {
  if (dmat->IsDense()) return dmat->Info().num_col_;

@@ -86,17 +29,19 @@ size_t GetRowStride(DMatrix* dmat) {

 // Build the quantile sketch across the whole input data, then use the histogram cuts to compress
 // each CSR page, and write the accumulated ELLPACK pages to disk.
-EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
+EllpackPageSource::EllpackPageSource(DMatrix* dmat,
                                     const std::string& cache_info,
-                                             const BatchParam& param) noexcept(false)
-    : device_(param.gpu_id), cache_info_(cache_info) {
-
+                                     const BatchParam& param) noexcept(false) {
+  cache_info_ = ParseCacheInfo(cache_info, kPageType_);
+  for (auto file : cache_info_.name_shards) {
+    CheckCacheFileExists(file);
+  }
  if (param.gpu_page_size > 0) {
    page_size_ = param.gpu_page_size;
  }

  monitor_.Init("ellpack_page_source");
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(param.gpu_id));

  monitor_.StartCuda("Quantiles");
  size_t row_stride = GetRowStride(dmat);
@@ -104,75 +49,52 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
                                   param.gpu_batch_nrows);
  monitor_.StopCuda("Quantiles");

-  monitor_.StartCuda("CreateEllpackInfo");
-  ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, cuts, &ba_);
-  monitor_.StopCuda("CreateEllpackInfo");
-
  monitor_.StartCuda("WriteEllpackPages");
-  WriteEllpackPages(dmat, cache_info);
+  WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
  monitor_.StopCuda("WriteEllpackPages");

-  source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
-      ParseCacheInfo(cache_info_, kPageType_)));
-}
-
-void EllpackPageSourceImpl::BeforeFirst() {
-  source_.reset(new ExternalMemoryPrefetcher<EllpackPage>(
-      ParseCacheInfo(cache_info_, kPageType_)));
-  source_->BeforeFirst();
-}
-
-bool EllpackPageSourceImpl::Next() {
-  return source_->Next();
-}
-
-EllpackPage& EllpackPageSourceImpl::Value() {
-  EllpackPage& page = source_->Value();
-  page.Impl()->InitDevice(device_, ellpack_info_);
-  return page;
-}
-
-const EllpackPage& EllpackPageSourceImpl::Value() const {
-  EllpackPage& page = source_->Value();
-  page.Impl()->InitDevice(device_, ellpack_info_);
-  return page;
+  external_prefetcher_.reset(
+      new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
 }

 // Compress each CSR page to ELLPACK, and write the accumulated pages to disk.
-void EllpackPageSourceImpl::WriteEllpackPages(DMatrix* dmat, const std::string& cache_info) const {
+void EllpackPageSource::WriteEllpackPages(int device, DMatrix* dmat,
+                                          const common::HistogramCuts& cuts,
+                                          const std::string& cache_info,
+                                          size_t row_stride) const {
  auto cinfo = ParseCacheInfo(cache_info, kPageType_);
  const size_t extra_buffer_capacity = 6;
-  SparsePageWriter<EllpackPage> writer(
-      cinfo.name_shards, cinfo.format_shards, extra_buffer_capacity);
+  SparsePageWriter<EllpackPage> writer(cinfo.name_shards, cinfo.format_shards,
+                                       extra_buffer_capacity);
  std::shared_ptr<EllpackPage> page;
+  SparsePage temp_host_page;
  writer.Alloc(&page);
  auto* impl = page->Impl();
-  impl->matrix.info = ellpack_info_;
-  impl->Clear();

-  const MetaInfo& info = dmat->Info();
  size_t bytes_write = 0;
  double tstart = dmlc::GetTime();
  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    impl->Push(device_, batch);
+    temp_host_page.Push(batch);

-    size_t mem_cost_bytes = impl->MemCostBytes();
+    size_t mem_cost_bytes =
+        EllpackPageImpl::MemCostBytes(temp_host_page.Size(), row_stride, cuts);
    if (mem_cost_bytes >= page_size_) {
      bytes_write += mem_cost_bytes;
-      impl->CompressSparsePage(device_);
+      *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
+                              row_stride);
      writer.PushWrite(std::move(page));
      writer.Alloc(&page);
      impl = page->Impl();
-      impl->matrix.info = ellpack_info_;
-      impl->Clear();
+      temp_host_page.Clear();
      double tdiff = dmlc::GetTime() - tstart;
      LOG(INFO) << "Writing " << kPageType_ << " to " << cache_info << " in "
                << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
                << (bytes_write >> 20UL) << " written";
    }
  }
-  if (impl->Size() != 0) {
-    impl->CompressSparsePage(device_);
+  if (temp_host_page.Size() != 0) {
+    *impl = EllpackPageImpl(device, cuts, temp_host_page, dmat->IsDense(),
+                            row_stride);
    writer.PushWrite(std::move(page));
  }
 }
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -10,19 +10,17 @@
 #include <string>

 #include "../common/timer.h"
+#include "../common/hist_util.h"
+#include "sparse_page_source.h"

 namespace xgboost {
 namespace data {

-class EllpackPageSourceImpl;
-
 /*!
 * \brief External memory data source for ELLPACK format.
 *
- * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
- * including CUDA-specific implementation details in the header.
 */
-class EllpackPageSource : public DataSource<EllpackPage> {
+class EllpackPageSource {
 public:
  /*!
   * \brief Create source from cache files the cache_prefix.
@@ -32,19 +30,33 @@ class EllpackPageSource : public DataSource<EllpackPage> {
                             const std::string& cache_info,
                             const BatchParam& param) noexcept(false);

-  /*! \brief destructor */
-  ~EllpackPageSource() override = default;
+  BatchSet<EllpackPage> GetBatchSet() {
+    auto begin_iter = BatchIterator<EllpackPage>(
+        new SparseBatchIteratorImpl<ExternalMemoryPrefetcher<EllpackPage>,
+                                    EllpackPage>(external_prefetcher_.get()));
+    return BatchSet<EllpackPage>(begin_iter);
+  }

-  void BeforeFirst() override;
-  bool Next() override;
-  EllpackPage& Value();
-  const EllpackPage& Value() const override;
-
-  const EllpackPageSourceImpl* Impl() const { return impl_.get(); }
-  EllpackPageSourceImpl* Impl() { return impl_.get(); }
+  ~EllpackPageSource() {
+    external_prefetcher_.reset();
+    for (auto file : cache_info_.name_shards) {
+      TryDeleteCacheFile(file);
+    }
+  }

 private:
-  std::shared_ptr<EllpackPageSourceImpl> impl_;
+  void WriteEllpackPages(int device, DMatrix* dmat,
+                         const common::HistogramCuts& cuts,
+                         const std::string& cache_info,
+                         size_t row_stride) const;
+
+  /*! \brief The page type string for ELLPACK. */
+  const std::string kPageType_{".ellpack.page"};
+
+  size_t page_size_{DMatrix::kPageSize};
+  common::Monitor monitor_;
+  std::unique_ptr<ExternalMemoryPrefetcher<EllpackPage>> external_prefetcher_;
+  CacheInfo cache_info_;
 };

 }  // namespace data
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -51,11 +51,7 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
    ellpack_source_.reset(new EllpackPageSource(this, cache_info_, param));
    batch_param_ = param;
  }
-  ellpack_source_->BeforeFirst();
-  ellpack_source_->Next();
-  auto begin_iter = BatchIterator<EllpackPage>(
-      new SparseBatchIteratorImpl<EllpackPageSource, EllpackPage>(ellpack_source_.get()));
-  return BatchSet<EllpackPage>(begin_iter);
+  return ellpack_source_->GetBatchSet();
 }

 }  // namespace data
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -97,9 +97,11 @@ struct SparsePageLoader {
 };

 struct EllpackLoader {
-  EllpackMatrix const& matrix;
-  XGBOOST_DEVICE EllpackLoader(EllpackMatrix const& m, bool use_shared, bst_feature_t num_features,
-                               bst_row_t num_rows, size_t entry_start) : matrix{m} {}
+  EllpackDeviceAccessor const& matrix;
+  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool use_shared,
+                               bst_feature_t num_features, bst_row_t num_rows,
+                               size_t entry_start)
+      : matrix{m} {}
  __device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
    auto gidx = matrix.GetBinIndex(ridx, fidx);
    if (gidx == -1) {
@@ -107,10 +109,10 @@ struct EllpackLoader {
    }
    // The gradient index needs to be shifted by one as min values are not included in the
    // cuts.
-    if (gidx == matrix.info.feature_segments[fidx]) {
-      return matrix.info.min_fvalue[fidx];
+    if (gidx == matrix.feature_segments[fidx]) {
+      return matrix.min_fvalue[fidx];
    }
-    return matrix.info.gidx_fvalue_map[gidx - 1];
+    return matrix.gidx_fvalue_map[gidx - 1];
  }
 };

@@ -217,7 +219,7 @@ class GPUPredictor : public xgboost::Predictor {
        this->tree_begin_, this->tree_end_, num_features, num_rows,
        entry_start, use_shared, this->num_group_);
  }
-  void PredictInternal(EllpackMatrix const& batch, HostDeviceVector<bst_float>* out_preds,
+  void PredictInternal(EllpackDeviceAccessor const& batch, HostDeviceVector<bst_float>* out_preds,
                       size_t batch_offset) {
    const uint32_t BLOCK_THREADS = 256;
    size_t num_rows = batch.n_rows;
@@ -226,11 +228,11 @@ class GPUPredictor : public xgboost::Predictor {
    bool use_shared = false;
    size_t entry_start = 0;
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
-        PredictKernel<EllpackLoader, EllpackMatrix>,
+        PredictKernel<EllpackLoader, EllpackDeviceAccessor>,
        batch,
        dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
-        this->tree_begin_, this->tree_end_, batch.info.NumFeatures(), num_rows,
+        this->tree_begin_, this->tree_end_, batch.NumFeatures(), num_rows,
        entry_start, use_shared, this->num_group_);
  }

@@ -269,8 +271,10 @@ class GPUPredictor : public xgboost::Predictor {
    if (dmat->PageExists<EllpackPage>()) {
      size_t batch_offset = 0;
      for (auto const& page : dmat->GetBatches<EllpackPage>()) {
-        this->PredictInternal(page.Impl()->matrix, out_preds, batch_offset);
-        batch_offset += page.Impl()->matrix.n_rows;
+        this->PredictInternal(
+            page.Impl()->GetDeviceAccessor(generic_param_->gpu_id), out_preds,
+            batch_offset);
+        batch_offset += page.Impl()->n_rows;
      }
    } else {
      size_t batch_offset = 0;
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -153,7 +153,8 @@ ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
                                                   size_t n_rows,
                                                   const BatchParam& batch_param)
    : batch_param_(batch_param),
-      page_(new EllpackPageImpl(batch_param.gpu_id, page->matrix.info, n_rows)) {}
+      page_(new EllpackPageImpl(batch_param.gpu_id, page->cuts_, page->is_dense,
+                                page->row_stride, n_rows)) {}

 GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
                                                     DMatrix* dmat) {
@@ -217,9 +218,9 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP

  // Create a new ELLPACK page with empty rows.
  page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
-                                  original_page_->matrix.info,
-                                  sample_rows));
+  page_.reset(new EllpackPageImpl(
+      batch_param_.gpu_id, original_page_->cuts_, original_page_->is_dense,
+                                  original_page_->row_stride, sample_rows));

  // Compact the ELLPACK pages into the single sample page.
  thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
@@ -298,9 +299,9 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra

  // Create a new ELLPACK page with empty rows.
  page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(batch_param_.gpu_id,
-                                  original_page_->matrix.info,
-                                  sample_rows));
+  page_.reset(new EllpackPageImpl(batch_param_.gpu_id, original_page_->cuts_,
+                                  original_page_->is_dense,
+                                  original_page_->row_stride, sample_rows));

  // Compact the ELLPACK pages into the single sample page.
  thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
@@ -319,7 +320,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
  monitor_.Init("gradient_based_sampler");

  bool is_sampling = subsample < 1.0;
-  bool is_external_memory = page->matrix.n_rows != n_rows;
+  bool is_external_memory = page->n_rows != n_rows;

  if (is_sampling) {
    switch (sampling_method) {
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -101,7 +101,7 @@ template GradientPairPrecise CreateRoundingFactor(common::Span<GradientPair cons
 template GradientPair CreateRoundingFactor(common::Span<GradientPair const> gpair);

 template <typename GradientSumT>
-__global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
+__global__ void SharedMemHistKernel(EllpackDeviceAccessor matrix,
                                    common::Span<const RowPartitioner::RowIndexT> d_ridx,
                                    GradientSumT* __restrict__ d_node_hist,
                                    const GradientPair* __restrict__ d_gpair,
@@ -112,14 +112,14 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
  extern __shared__ char smem[];
  GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem);  // NOLINT
  if (use_shared_memory_histograms) {
-    dh::BlockFill(smem_arr, matrix.info.n_bins, GradientSumT());
+    dh::BlockFill(smem_arr, matrix.NumBins(), GradientSumT());
    __syncthreads();
  }
  for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
-    int ridx = d_ridx[idx / matrix.info.row_stride];
+    int ridx = d_ridx[idx / matrix.row_stride];
    int gidx =
-        matrix.gidx_iter[ridx * matrix.info.row_stride + idx % matrix.info.row_stride];
-    if (gidx != matrix.info.n_bins) {
+        matrix.gidx_iter[ridx * matrix.row_stride + idx % matrix.row_stride];
+    if (gidx != matrix.NumBins()) {
      GradientSumT truncated {
        TruncateWithRoundingFactor<T>(rounding.GetGrad(), d_gpair[ridx].GetGrad()),
        TruncateWithRoundingFactor<T>(rounding.GetHess(), d_gpair[ridx].GetHess()),
@@ -135,7 +135,7 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
  if (use_shared_memory_histograms) {
    // Write shared memory back to global memory
    __syncthreads();
-    for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.info.n_bins)) {
+    for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.NumBins())) {
      GradientSumT truncated {
        TruncateWithRoundingFactor<T>(rounding.GetGrad(), smem_arr[i].GetGrad()),
        TruncateWithRoundingFactor<T>(rounding.GetHess(), smem_arr[i].GetHess()),
@@ -146,16 +146,16 @@ __global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
 }

 template <typename GradientSumT>
-void BuildGradientHistogram(EllpackMatrix const& matrix,
+void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
                            common::Span<GradientPair const> gpair,
                            common::Span<const uint32_t> d_ridx,
                            common::Span<GradientSumT> histogram,
                            GradientSumT rounding, bool shared) {
  const size_t smem_size =
      shared
-      ? sizeof(GradientSumT) * matrix.info.n_bins
+      ? sizeof(GradientSumT) * matrix.NumBins()
      : 0;
-  auto n_elements = d_ridx.size() * matrix.info.row_stride;
+  auto n_elements = d_ridx.size() * matrix.row_stride;

  uint32_t items_per_thread = 8;
  uint32_t block_threads = 256;
@@ -168,14 +168,14 @@ void BuildGradientHistogram(EllpackMatrix const& matrix,
 }

 template void BuildGradientHistogram<GradientPair>(
-    EllpackMatrix const& matrix,
+    EllpackDeviceAccessor const& matrix,
    common::Span<GradientPair const> gpair,
    common::Span<const uint32_t> ridx,
    common::Span<GradientPair> histogram,
    GradientPair rounding, bool shared);

 template void BuildGradientHistogram<GradientPairPrecise>(
-    EllpackMatrix const& matrix,
+    EllpackDeviceAccessor const& matrix,
    common::Span<GradientPair const> gpair,
    common::Span<const uint32_t> ridx,
    common::Span<GradientPairPrecise> histogram,
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -18,7 +18,7 @@ DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x)
 }

 template <typename GradientSumT>
-void BuildGradientHistogram(EllpackMatrix const& matrix,
+void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
                            common::Span<GradientPair const> gpair,
                            common::Span<const uint32_t> ridx,
                            common::Span<GradientSumT> histogram,
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -180,15 +180,15 @@ template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
          typename MaxReduceT, typename TempStorageT, typename GradientSumT>
 __device__ void EvaluateFeature(
    int fidx, common::Span<const GradientSumT> node_histogram,
-    const xgboost::EllpackMatrix& matrix,
+    const EllpackDeviceAccessor& matrix,
    DeviceSplitCandidate* best_split,  // shared memory storing best split
    const DeviceNodeStats& node, const GPUTrainingParam& param,
    TempStorageT* temp_storage,  // temp memory for cub operations
    int constraint,              // monotonic_constraints
    const ValueConstraint& value_constraint) {
  // Use pointer from cut to indicate begin and end of bins for each feature.
-  uint32_t gidx_begin = matrix.info.feature_segments[fidx];  // begining bin
-  uint32_t gidx_end = matrix.info.feature_segments[fidx + 1];  // end bin for i^th feature
+  uint32_t gidx_begin = matrix.feature_segments[fidx];  // begining bin
+  uint32_t gidx_end = matrix.feature_segments[fidx + 1];  // end bin for i^th feature

  // Sum histogram bins for current feature
  GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
@@ -236,9 +236,9 @@ __device__ void EvaluateFeature(
      int split_gidx = (scan_begin + threadIdx.x) - 1;
      float fvalue;
      if (split_gidx < static_cast<int>(gidx_begin)) {
-        fvalue =  matrix.info.min_fvalue[fidx];
+        fvalue =  matrix.min_fvalue[fidx];
      } else {
-        fvalue = matrix.info.gidx_fvalue_map[split_gidx];
+        fvalue = matrix.gidx_fvalue_map[split_gidx];
      }
      GradientSumT left = missing_left ? bin + missing : bin;
      GradientSumT right = parent_sum - left;
@@ -254,7 +254,7 @@ __global__ void EvaluateSplitKernel(
    common::Span<const GradientSumT> node_histogram,  // histogram for gradients
    common::Span<const bst_feature_t> feature_set,    // Selected features
    DeviceNodeStats node,
-    xgboost::EllpackMatrix matrix,
+    xgboost::EllpackDeviceAccessor matrix,
    GPUTrainingParam gpu_param,
    common::Span<DeviceSplitCandidate> split_candidates,  // resulting split
    ValueConstraint value_constraint,
@@ -601,7 +601,7 @@ struct GPUHistMakerDevice {
      uint32_t constexpr kBlockThreads = 256;
      dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
          EvaluateSplitKernel<kBlockThreads, GradientSumT>,
-          hist.GetNodeHistogram(nidx), d_feature_set, node, page->matrix,
+          hist.GetNodeHistogram(nidx), d_feature_set, node, page->GetDeviceAccessor(device_id),
          gpu_param, d_split_candidates, node_value_constraints[nidx],
          monotone_constraints);

@@ -625,9 +625,7 @@ struct GPUHistMakerDevice {
    hist.AllocateHistogram(nidx);
    auto d_node_hist = hist.GetNodeHistogram(nidx);
    auto d_ridx = row_partitioner->GetRows(nidx);
-    auto d_gpair = gpair.data();
-
-    BuildGradientHistogram(page->matrix, gpair, d_ridx, d_node_hist,
+    BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
                           histogram_rounding, use_shared_memory_histograms);
  }

@@ -637,7 +635,7 @@ struct GPUHistMakerDevice {
    auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
    auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);

-    dh::LaunchN(device_id, page->matrix.info.n_bins, [=] __device__(size_t idx) {
+    dh::LaunchN(device_id, page->cuts_.TotalBins(), [=] __device__(size_t idx) {
      d_node_hist_subtraction[idx] =
          d_node_hist_parent[idx] - d_node_hist_histogram[idx];
    });
@@ -652,7 +650,7 @@ struct GPUHistMakerDevice {
  }

  void UpdatePosition(int nidx, RegTree::Node split_node) {
-    auto d_matrix = page->matrix;
+    auto d_matrix = page->GetDeviceAccessor(device_id);

    row_partitioner->UpdatePosition(
        nidx, split_node.LeftChild(), split_node.RightChild(),
@@ -689,7 +687,7 @@ struct GPUHistMakerDevice {
      row_partitioner.reset();  // Release the device memory first before reallocating
      row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_));
    }
-    if (page->matrix.n_rows == p_fmat->Info().num_row_) {
+    if (page->n_rows == p_fmat->Info().num_row_) {
      FinalisePositionInPage(page, d_nodes);
    } else {
      for (auto& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
@@ -699,7 +697,7 @@ struct GPUHistMakerDevice {
  }

  void FinalisePositionInPage(EllpackPageImpl* page, const common::Span<RegTree::Node> d_nodes) {
-    auto d_matrix = page->matrix;
+    auto d_matrix = page->GetDeviceAccessor(device_id);
    row_partitioner->FinalisePosition(
        [=] __device__(size_t row_id, int position) {
      if (!d_matrix.IsInRange(row_id)) {
@@ -765,7 +763,7 @@ struct GPUHistMakerDevice {
    reducer->AllReduceSum(
        reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
        reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-        page->matrix.info.n_bins * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
+        page->cuts_.TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
    reducer->Synchronize();

    monitor.StopCuda("AllReduce");
@@ -954,14 +952,14 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
  // check if we can use shared memory for building histograms
  // (assuming atleast we need 2 CTAs per SM to maintain decent latency
  // hiding)
-  auto histogram_size = sizeof(GradientSumT) * page->matrix.info.n_bins;
+  auto histogram_size = sizeof(GradientSumT) * page->cuts_.TotalBins();
  auto max_smem = dh::MaxSharedMemory(device_id);
  if (histogram_size <= max_smem) {
    use_shared_memory_histograms = true;
  }

  // Init histogram
-  hist.Init(device_id, page->matrix.info.n_bins);
+  hist.Init(device_id, page->cuts_.TotalBins());
 }

 template <typename GradientSumT>
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -19,23 +19,19 @@ TEST(EllpackPage, EmptyDMatrix) {
  auto dmat = *CreateDMatrix(kNRows, kNCols, kSparsity);
  auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin, kGpuBatchNRows}).begin();
  auto impl = page.Impl();
-  ASSERT_EQ(impl->matrix.info.feature_segments.size(), 1);
-  ASSERT_EQ(impl->matrix.info.min_fvalue.size(), 0);
-  ASSERT_EQ(impl->matrix.info.gidx_fvalue_map.size(), 0);
-  ASSERT_EQ(impl->matrix.info.row_stride, 0);
-  ASSERT_EQ(impl->matrix.info.n_bins, 0);
-  ASSERT_EQ(impl->gidx_buffer.size(), 4);
+  ASSERT_EQ(impl->row_stride, 0);
+  ASSERT_EQ(impl->cuts_.TotalBins(), 0);
+  ASSERT_EQ(impl->gidx_buffer.Size(), 4);
 }

 TEST(EllpackPage, BuildGidxDense) {
  int constexpr kNRows = 16, kNCols = 8;
  auto page = BuildEllpackPage(kNRows, kNCols);

-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
-  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
+  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
+  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), page->NumSymbols());

-  ASSERT_EQ(page->matrix.info.row_stride, kNCols);
+  ASSERT_EQ(page->row_stride, kNCols);

  std::vector<uint32_t> solution = {
    0, 3, 8,  9, 14, 17, 20, 21,
@@ -64,11 +60,10 @@ TEST(EllpackPage, BuildGidxSparse) {
  int constexpr kNRows = 16, kNCols = 8;
  auto page = BuildEllpackPage(kNRows, kNCols, 0.9f);

-  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, page->gidx_buffer);
+  std::vector<common::CompressedByteT> h_gidx_buffer(page->gidx_buffer.HostVector());
  common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);

-  ASSERT_LE(page->matrix.info.row_stride, 3);
+  ASSERT_LE(page->row_stride, 3);

  // row_stride = 3, 16 rows, 48 entries for ELLPack
  std::vector<uint32_t> solution = {
@@ -76,16 +71,16 @@ TEST(EllpackPage, BuildGidxSparse) {
    24, 24, 24, 24, 24,  5, 24, 24,  0, 16, 24, 15, 24, 24, 24, 24,
    24,  7, 14, 16,  4, 24, 24, 24, 24, 24,  9, 24, 24,  1, 24, 24
  };
-  for (size_t i = 0; i < kNRows * page->matrix.info.row_stride; ++i) {
+  for (size_t i = 0; i < kNRows * page->row_stride; ++i) {
    ASSERT_EQ(solution[i], gidx[i]);
  }
 }

 struct ReadRowFunction {
-  EllpackMatrix matrix;
+  EllpackDeviceAccessor matrix;
  int row;
  bst_float* row_data_d;
-  ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
+  ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
      : matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}

  __device__ void operator()(size_t col) {
@@ -110,7 +105,8 @@ TEST(EllpackPage, Copy) {
  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();

  // Create an empty result page.
-  EllpackPageImpl result(0, page->matrix.info, kRows);
+  EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
+                         kRows);

  // Copy batch pages into the result page.
  size_t offset = 0;
@@ -126,13 +122,13 @@ TEST(EllpackPage, Copy) {
  std::vector<bst_float> row_result(kCols);
  for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
    auto impl = page.Impl();
-    EXPECT_EQ(impl->matrix.base_rowid, current_row);
+    EXPECT_EQ(impl->base_rowid, current_row);

    for (size_t i = 0; i < impl->Size(); i++) {
-      dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
      thrust::copy(row_d.begin(), row_d.end(), row.begin());

-      dh::LaunchN(0, kCols, ReadRowFunction(result.matrix, current_row, row_result_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get()));
      thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

      EXPECT_EQ(row, row_result);
@@ -155,7 +151,8 @@ TEST(EllpackPage, Compact) {
  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();

  // Create an empty result page.
-  EllpackPageImpl result(0, page->matrix.info, kCompactedRows);
+  EllpackPageImpl result(0, page->cuts_, page->is_dense, page->row_stride,
+                         kCompactedRows);

  // Compact batch pages into the result page.
  std::vector<size_t> row_indexes_h {
@@ -174,7 +171,7 @@ TEST(EllpackPage, Compact) {
  std::vector<bst_float> row_result(kCols);
  for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
    auto impl = page.Impl();
-    EXPECT_EQ(impl->matrix.base_rowid, current_row);
+    EXPECT_EQ(impl->base_rowid, current_row);

    for (size_t i = 0; i < impl->Size(); i++) {
      size_t compacted_row = row_indexes_h[current_row];
@@ -183,11 +180,12 @@ TEST(EllpackPage, Compact) {
        continue;
      }

-      dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
+      dh::safe_cuda (cudaDeviceSynchronize());
      thrust::copy(row_d.begin(), row_d.end(), row.begin());

      dh::LaunchN(0, kCols,
-                  ReadRowFunction(result.matrix, compacted_row, row_result_d.data().get()));
+                  ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, row_result_d.data().get()));
      thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

      EXPECT_EQ(row, row_result);
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -3,6 +3,7 @@
 #include <dmlc/filesystem.h>
 #include "../helpers.h"
 #include "../../../src/common/compressed_iterator.h"
+#include "../../../src/data/ellpack_page.cuh"

 namespace xgboost {

@@ -58,31 +59,29 @@ TEST(SparsePageDMatrix, EllpackPageContent) {

  BatchParam param{0, 2, 0, 0};
  auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_EQ(impl->matrix.base_rowid, 0);
-  EXPECT_EQ(impl->matrix.n_rows, kRows);
-  EXPECT_FALSE(impl->matrix.info.is_dense);
-  EXPECT_EQ(impl->matrix.info.row_stride, 2);
-  EXPECT_EQ(impl->matrix.info.n_bins, 4);
+  EXPECT_EQ(impl->base_rowid, 0);
+  EXPECT_EQ(impl->n_rows, kRows);
+  EXPECT_FALSE(impl->is_dense);
+  EXPECT_EQ(impl->row_stride, 2);
+  EXPECT_EQ(impl->cuts_.TotalBins(), 4);

  auto impl_ext = (*dmat_ext->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_EQ(impl_ext->matrix.base_rowid, 0);
-  EXPECT_EQ(impl_ext->matrix.n_rows, kRows);
-  EXPECT_FALSE(impl_ext->matrix.info.is_dense);
-  EXPECT_EQ(impl_ext->matrix.info.row_stride, 2);
-  EXPECT_EQ(impl_ext->matrix.info.n_bins, 4);
+  EXPECT_EQ(impl_ext->base_rowid, 0);
+  EXPECT_EQ(impl_ext->n_rows, kRows);
+  EXPECT_FALSE(impl_ext->is_dense);
+  EXPECT_EQ(impl_ext->row_stride, 2);
+  EXPECT_EQ(impl_ext->cuts_.TotalBins(), 4);

-  std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.size());
-  std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&buffer, impl->gidx_buffer);
-  dh::CopyDeviceSpanToVector(&buffer_ext, impl_ext->gidx_buffer);
+  std::vector<common::CompressedByteT> buffer(impl->gidx_buffer.HostVector());
+  std::vector<common::CompressedByteT> buffer_ext(impl_ext->gidx_buffer.HostVector());
  EXPECT_EQ(buffer, buffer_ext);
 }

 struct ReadRowFunction {
-  EllpackMatrix matrix;
+  EllpackDeviceAccessor matrix;
  int row;
  bst_float* row_data_d;
-  ReadRowFunction(EllpackMatrix matrix, int row, bst_float* row_data_d)
+  ReadRowFunction(EllpackDeviceAccessor matrix, int row, bst_float* row_data_d)
      : matrix(std::move(matrix)), row(row), row_data_d(row_data_d) {}

  __device__ void operator()(size_t col) {
@@ -110,8 +109,8 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {

  BatchParam param{0, kMaxBins, 0, kPageSize};
  auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_EQ(impl->matrix.base_rowid, 0);
-  EXPECT_EQ(impl->matrix.n_rows, kRows);
+  EXPECT_EQ(impl->base_rowid, 0);
+  EXPECT_EQ(impl->n_rows, kRows);

  size_t current_row = 0;
  thrust::device_vector<bst_float> row_d(kCols);
@@ -120,13 +119,13 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
  std::vector<bst_float> row_ext(kCols);
  for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
    auto impl_ext = page.Impl();
-    EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
+    EXPECT_EQ(impl_ext->base_rowid, current_row);

    for (size_t i = 0; i < impl_ext->Size(); i++) {
-      dh::LaunchN(0, kCols, ReadRowFunction(impl->matrix, current_row, row_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
      thrust::copy(row_d.begin(), row_d.end(), row.begin());

-      dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->matrix, current_row, row_ext_d.data().get()));
+      dh::LaunchN(0, kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get()));
      thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());

      EXPECT_EQ(row, row_ext);
@@ -155,8 +154,8 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
  size_t current_row = 0;
  for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
    auto impl_ext = page.Impl();
-    EXPECT_EQ(impl_ext->matrix.base_rowid, current_row);
-    current_row += impl_ext->matrix.n_rows;
+    EXPECT_EQ(impl_ext->base_rowid, current_row);
+    current_row += impl_ext->n_rows;
  }
 }

--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -244,13 +244,13 @@ class HistogramCutsWrapper : public common::HistogramCuts {
 public:
  using SuperT = common::HistogramCuts;
  void SetValues(std::vector<float> cuts) {
-    SuperT::cut_values_ = std::move(cuts);
+    SuperT::cut_values_.HostVector() = std::move(cuts);
  }
  void SetPtrs(std::vector<uint32_t> ptrs) {
-    SuperT::cut_ptrs_ = std::move(ptrs);
+    SuperT::cut_ptrs_.HostVector() = std::move(ptrs);
  }
  void SetMins(std::vector<float> mins) {
-    SuperT::min_vals_ = std::move(mins);
+    SuperT::min_vals_.HostVector() = std::move(mins);
  }
 };
 }  //  anonymous namespace
@@ -279,10 +279,8 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
    row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]);
  }

-  auto page = std::unique_ptr<EllpackPageImpl>(new EllpackPageImpl(dmat->get(), {0, 256, 0}));
-  page->InitInfo(0, (*dmat)->IsDense(), row_stride, cmat);
-  page->InitCompressedData(0, n_rows);
-  page->CreateHistIndices(0, batch, RowStateOnDevice(batch.Size(), batch.Size()));
+  auto page = std::unique_ptr<EllpackPageImpl>(
+      new EllpackPageImpl(0, cmat, batch, (*dmat)->IsDense(), row_stride));

  delete dmat;

--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -3,6 +3,7 @@
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
 #include "../../helpers.h"
+#include "dmlc/filesystem.h"

 namespace xgboost {
 namespace tree {
@@ -29,7 +30,7 @@ void VerifySampling(size_t page_size,
  BatchParam param{0, 256, 0, page_size};
  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
  if (page_size != 0) {
-    EXPECT_NE(page->matrix.n_rows, kRows);
+    EXPECT_NE(page->n_rows, kRows);
  }

  GradientBasedSampler sampler(page, kRows, param, subsample, sampling_method);
@@ -37,11 +38,11 @@ void VerifySampling(size_t page_size,

  if (fixed_size_sampling) {
    EXPECT_EQ(sample.sample_rows, kRows);
-    EXPECT_EQ(sample.page->matrix.n_rows, kRows);
+    EXPECT_EQ(sample.page->n_rows, kRows);
    EXPECT_EQ(sample.gpair.size(), kRows);
  } else {
-    EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016f);
-    EXPECT_NEAR(sample.page->matrix.n_rows, sample_rows, kRows * 0.016f);
+    EXPECT_NEAR(sample.sample_rows, sample_rows, kRows * 0.016);
+    EXPECT_NEAR(sample.page->n_rows, sample_rows, kRows * 0.016f);
    EXPECT_NEAR(sample.gpair.size(), sample_rows, kRows * 0.016f);
  }

@@ -83,7 +84,7 @@ TEST(GradientBasedSampler, NoSampling_ExternalMemory) {

  BatchParam param{0, 256, 0, kPageSize};
  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
-  EXPECT_NE(page->matrix.n_rows, kRows);
+  EXPECT_NE(page->n_rows, kRows);

  GradientBasedSampler sampler(page, kRows, param, kSubsample, TrainParam::kUniform);
  auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());
@@ -91,21 +92,19 @@ TEST(GradientBasedSampler, NoSampling_ExternalMemory) {
  EXPECT_EQ(sample.sample_rows, kRows);
  EXPECT_EQ(sample.gpair.size(), gpair.Size());
  EXPECT_EQ(sample.gpair.data(), gpair.DevicePointer());
-  EXPECT_EQ(sampled_page->matrix.n_rows, kRows);
+  EXPECT_EQ(sampled_page->n_rows, kRows);

-  std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&buffer, sampled_page->gidx_buffer);
+  std::vector<common::CompressedByteT> buffer(sampled_page->gidx_buffer.HostVector());
  common::CompressedIterator<common::CompressedByteT>
-      ci(buffer.data(), sampled_page->matrix.info.NumSymbols());
+      ci(buffer.data(), sampled_page->NumSymbols());

  size_t offset = 0;
  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
    auto page = batch.Impl();
-    std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.size());
-    dh::CopyDeviceSpanToVector(&page_buffer, page->gidx_buffer);
+    std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
    common::CompressedIterator<common::CompressedByteT>
-        page_ci(page_buffer.data(), page->matrix.info.NumSymbols());
-    size_t num_elements = page->matrix.n_rows * page->matrix.info.row_stride;
+        page_ci(page_buffer.data(), page->NumSymbols());
+    size_t num_elements = page->n_rows * page->row_stride;
    for (size_t i = 0; i < num_elements; i++) {
      EXPECT_EQ(ci[i + offset], page_ci[i]);
    }
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -27,7 +27,7 @@ void TestDeterminsticHistogram() {
    gpair.SetDevice(0);

    auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-    BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                           d_histogram, rounding, true);

    for (size_t i = 0; i < kRounds; ++i) {
@@ -35,7 +35,7 @@ void TestDeterminsticHistogram() {
      auto d_histogram = dh::ToSpan(new_histogram);

      auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
-      BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+      BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                             d_histogram, rounding, true);

      for (size_t j = 0; j < new_histogram.size(); ++j) {
@@ -50,7 +50,7 @@ void TestDeterminsticHistogram() {
      auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
      gpair.SetDevice(0);
      dh::device_vector<Gradient> baseline(kBins * kCols);
-      BuildGradientHistogram(page->matrix, gpair.DeviceSpan(), ridx,
+      BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
                             dh::ToSpan(baseline), rounding, true);
      for (size_t i = 0; i < baseline.size(); ++i) {
        EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -97,12 +97,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  }
  gpair.SetDevice(0);

-  thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.size());
+  thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());

-  common::CompressedByteT* d_gidx_buffer_ptr = page->gidx_buffer.data();
-  dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr,
-                           sizeof(common::CompressedByteT) * page->gidx_buffer.size(),
-                           cudaMemcpyDeviceToHost));

  maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
  maker.hist.AllocateHistogram(0);
@@ -196,15 +192,10 @@ TEST(GpuHist, EvaluateSplits) {
  auto cmat = GetHostCutMatrix();

  // Copy cut matrix to device.
-  maker.ba.Allocate(0,
-                    &(page->matrix.info.feature_segments), cmat.Ptrs().size(),
-                    &(page->matrix.info.min_fvalue), cmat.MinValues().size(),
-                    &(page->matrix.info.gidx_fvalue_map), 24,
-                    &(maker.monotone_constraints), kNCols);
-  dh::CopyVectorToDeviceSpan(page->matrix.info.feature_segments, cmat.Ptrs());
-  dh::CopyVectorToDeviceSpan(page->matrix.info.gidx_fvalue_map, cmat.Values());
-  dh::CopyVectorToDeviceSpan(maker.monotone_constraints, param.monotone_constraints);
-  dh::CopyVectorToDeviceSpan(page->matrix.info.min_fvalue, cmat.MinValues());
+  page->cuts_ = cmat;
+  maker.ba.Allocate(0, &(maker.monotone_constraints), kNCols);
+  dh::CopyVectorToDeviceSpan(maker.monotone_constraints,
+                             param.monotone_constraints);

  // Initialize GPUHistMakerDevice::hist
  maker.hist.Init(0, (max_bins - 1) * kNCols);
@@ -274,15 +265,13 @@ void TestHistogramIndexImpl() {
  // Extract the device maker from the histogram makers and from that its compressed
  // histogram index
  const auto &maker = hist_maker.maker;
-  std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, maker->page->gidx_buffer);
+  std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());

  const auto &maker_ext = hist_maker_ext.maker;
-  std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, maker_ext->page->gidx_buffer);
+  std::vector<common::CompressedByteT> h_gidx_buffer_ext(maker_ext->page->gidx_buffer.HostVector());

-  ASSERT_EQ(maker->page->matrix.info.n_bins, maker_ext->page->matrix.info.n_bins);
-  ASSERT_EQ(maker->page->gidx_buffer.size(), maker_ext->page->gidx_buffer.size());
+  ASSERT_EQ(maker->page->cuts_.TotalBins(), maker_ext->page->cuts_.TotalBins());
+  ASSERT_EQ(maker->page->gidx_buffer.Size(), maker_ext->page->gidx_buffer.Size());

 }