Dmatrix refactor stage 1 (#3301)

* Use sparse page as singular CSR matrix representation
* Simplify dmatrix methods
* Reduce statefulness of batch iterators
* BREAKING CHANGE: Remove prob_buffer_row parameter. Users are instead recommended to sample their dataset as a preprocessing step before using XGBoost.
2018-06-07 10:25:58 +12:00
parent 286dccb8e8
commit a96039141a
47 changed files with 650 additions and 1036 deletions
--- a/src/tree/updater_basemaker-inl.h
+++ b/src/tree/updater_basemaker-inl.h
@@ -43,13 +43,12 @@ class BaseMaker: public TreeUpdater {
      std::fill(fminmax_.begin(), fminmax_.end(),
                -std::numeric_limits<bst_float>::max());
      // start accumulating statistics
-      dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator();
+      auto iter = p_fmat->ColIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
-        const ColBatch& batch = iter->Value();
-        for (bst_uint i = 0; i < batch.size; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const ColBatch::Inst& c = batch[i];
+        auto batch = iter->Value();
+        for (bst_uint fid = 0; fid < batch.Size(); ++fid) {
+           auto c = batch[fid];
          if (c.length != 0) {
            fminmax_[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax_[fid * 2 + 0]);
            fminmax_[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax_[fid * 2 + 1]);
@@ -104,7 +103,7 @@ class BaseMaker: public TreeUpdater {
  // ------static helper functions ------
  // helper function to get to next level of the tree
  /*! \brief this is  helper function for row based data*/
-  inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
+  inline static int NextLevel(const SparsePage::Inst &inst, const RegTree &tree, int nid) {
    const RegTree::Node &n = tree[nid];
    bst_uint findex = n.SplitIndex();
    for (unsigned i = 0; i < inst.length; ++i) {
@@ -244,12 +243,10 @@ class BaseMaker: public TreeUpdater {
   * \param tree the regression tree structure
   */
  inline void CorrectNonDefaultPositionByBatch(
-      const ColBatch& batch,
-      const std::vector<bst_uint> &sorted_split_set,
+      const SparsePage &batch, const std::vector<bst_uint> &sorted_split_set,
      const RegTree &tree) {
-    for (size_t i = 0; i < batch.size; ++i) {
-      ColBatch::Inst col = batch[i];
-      const bst_uint fid = batch.col_index[i];
+    for (size_t fid = 0; fid < batch.Size(); ++fid) {
+      auto col = batch[fid];
      auto it = std::lower_bound(sorted_split_set.begin(), sorted_split_set.end(), fid);

      if (it != sorted_split_set.end() && *it == fid) {
@@ -306,12 +303,11 @@ class BaseMaker: public TreeUpdater {
                                        const RegTree &tree) {
    std::vector<unsigned> fsplits;
    this->GetSplitSet(nodes, tree, &fsplits);
-    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+    auto iter = p_fmat->ColIterator();
    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        ColBatch::Inst col = batch[i];
-        const bst_uint fid = batch.col_index[i];
+      auto batch = iter->Value();
+      for (auto fid : fsplits) {
+        auto col = batch[fid];
        const auto ndata = static_cast<bst_omp_uint>(col.length);
        #pragma omp parallel for schedule(static)
        for (bst_omp_uint j = 0; j < ndata; ++j) {
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -252,7 +252,7 @@ class ColMaker: public TreeUpdater {
    }
    // parallel find the best split of current fid
    // this function does not support nested functions
-    inline void ParallelFindSplit(const ColBatch::Inst &col,
+    inline void ParallelFindSplit(const SparsePage::Inst &col,
                                  bst_uint fid,
                                  const DMatrix &fmat,
                                  const std::vector<GradientPair> &gpair) {
@@ -439,8 +439,8 @@ class ColMaker: public TreeUpdater {
      }
    }
    // same as EnumerateSplit, with cacheline prefetch optimization
-    inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
-                                       const ColBatch::Entry *end,
+    inline void EnumerateSplitCacheOpt(const Entry *begin,
+                                       const Entry *end,
                                       int d_step,
                                       bst_uint fid,
                                       const std::vector<GradientPair> &gpair,
@@ -457,18 +457,18 @@ class ColMaker: public TreeUpdater {
      int buf_position[kBuffer] = {};
      GradientPair buf_gpair[kBuffer] = {};
      // aligned ending position
-      const ColBatch::Entry *align_end;
+      const Entry *align_end;
      if (d_step > 0) {
        align_end = begin + (end - begin) / kBuffer * kBuffer;
      } else {
        align_end = begin - (begin - end) / kBuffer * kBuffer;
      }
      int i;
-      const ColBatch::Entry *it;
+      const Entry *it;
      const int align_step = d_step * kBuffer;
      // internal cached loop
      for (it = begin; it != align_end; it += align_step) {
-        const ColBatch::Entry *p;
+        const Entry *p;
        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
          buf_position[i] = position_[p->index];
          buf_gpair[i] = gpair[p->index];
@@ -519,8 +519,8 @@ class ColMaker: public TreeUpdater {
    }

    // enumerate the split values of specific feature
-    inline void EnumerateSplit(const ColBatch::Entry *begin,
-                               const ColBatch::Entry *end,
+    inline void EnumerateSplit(const Entry *begin,
+                               const Entry *end,
                               int d_step,
                               bst_uint fid,
                               const std::vector<GradientPair> &gpair,
@@ -538,7 +538,7 @@ class ColMaker: public TreeUpdater {
      }
      // left statistics
      TStats c(param_);
-      for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
+      for (const Entry *it = begin; it != end; it += d_step) {
        const bst_uint ridx = it->index;
        const int nid = position_[ridx];
        if (nid < 0) continue;
@@ -602,25 +602,26 @@ class ColMaker: public TreeUpdater {
    }

    // update the solution candidate
-    virtual void UpdateSolution(const ColBatch& batch,
-                                const std::vector<GradientPair>& gpair,
-                                const DMatrix& fmat) {
+    virtual void UpdateSolution(const SparsePage &batch,
+                                const std::vector<bst_uint> &feat_set,
+                                const std::vector<GradientPair> &gpair,
+                                const DMatrix &fmat) {
      const MetaInfo& info = fmat.Info();
      // start enumeration
-      const auto nsize = static_cast<bst_omp_uint>(batch.size);
+      const auto num_features = static_cast<bst_omp_uint>(feat_set.size());
      #if defined(_OPENMP)
-      const int batch_size = std::max(static_cast<int>(nsize / this->nthread_ / 32), 1);
+      const int batch_size = std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
      #endif
      int poption = param_.parallel_option;
      if (poption == 2) {
-        poption = static_cast<int>(nsize) * 2 < this->nthread_ ? 1 : 0;
+        poption = static_cast<int>(num_features) * 2 < this->nthread_ ? 1 : 0;
      }
      if (poption == 0) {
        #pragma omp parallel for schedule(dynamic, batch_size)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          const bst_uint fid = batch.col_index[i];
+        for (bst_omp_uint i = 0; i < num_features; ++i) {
+          int fid = feat_set[i];
          const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
+          auto c = batch[fid];
          const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue;
          if (param_.NeedForwardSearch(fmat.GetColDensity(fid), ind)) {
            this->EnumerateSplit(c.data, c.data + c.length, +1,
@@ -632,8 +633,8 @@ class ColMaker: public TreeUpdater {
          }
        }
      } else {
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          this->ParallelFindSplit(batch[i], batch.col_index[i],
+        for (bst_omp_uint fid = 0; fid < num_features; ++fid) {
+          this->ParallelFindSplit(batch[fid], fid,
                                  fmat, gpair);
        }
      }
@@ -653,9 +654,9 @@ class ColMaker: public TreeUpdater {
            << "colsample_bylevel cannot be zero.";
        feat_set.resize(n);
      }
-      dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator(feat_set);
+      auto iter = p_fmat->ColIterator();
      while (iter->Next()) {
-        this->UpdateSolution(iter->Value(), gpair, *p_fmat);
+        this->UpdateSolution(iter->Value(), feat_set, gpair, *p_fmat);
      }
      // after this each thread's stemp will get the best candidates, aggregate results
      this->SyncBestSolution(qexpand);
@@ -730,12 +731,11 @@ class ColMaker: public TreeUpdater {
      }
      std::sort(fsplits.begin(), fsplits.end());
      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      auto iter = p_fmat->ColIterator();
      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
+        auto batch = iter->Value();
+        for (auto fid : fsplits) {
+          auto col = batch[fid];
          const auto ndata = static_cast<bst_omp_uint>(col.length);
          #pragma omp parallel for schedule(static)
          for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -859,12 +859,11 @@ class DistColMaker : public ColMaker<TStats, TConstraint> {
            boolmap_[j] = 0;
        }
      }
-      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      auto iter = p_fmat->ColIterator();
      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
+        auto batch = iter->Value();
+        for (auto fid : fsplits) {
+          auto col = batch[fid];
          const auto ndata = static_cast<bst_omp_uint>(col.length);
          #pragma omp parallel for schedule(static)
          for (bst_omp_uint j = 0; j < ndata; ++j) {
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -661,16 +661,15 @@ class GPUMaker : public TreeUpdater {
    // in case you end up with a DMatrix having no column access
    // then make sure to enable that before copying the data!
    if (!dmat->HaveColAccess(true)) {
-      const std::vector<bool> enable(nCols, true);
-      dmat->InitColAccess(enable, 1, nRows, true);
+      dmat->InitColAccess(nRows, true);
    }
-    dmlc::DataIter<ColBatch>* iter = dmat->ColIterator();
+    auto iter = dmat->ColIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
-      const ColBatch& batch = iter->Value();
-      for (int i = 0; i < batch.size; i++) {
-        const ColBatch::Inst& col = batch[i];
-        for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
+      auto batch = iter->Value();
+      for (int i = 0; i < batch.Size(); i++) {
+        auto col = batch[i];
+        for (const Entry* it = col.data; it != col.data + col.length;
             it++) {
          int inst_id = static_cast<int>(it->index);
          fval->push_back(it->fvalue);
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -250,7 +250,7 @@ __device__ int upper_bound(const float* __restrict__ cuts, int n, float v) {
 __global__ void compress_bin_ellpack_k
 (common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer,
 const size_t* __restrict__ row_ptrs,
- const RowBatch::Entry* __restrict__ entries,
+ const Entry* __restrict__ entries,
 const float* __restrict__ cuts, const size_t* __restrict__ cut_rows,
 size_t base_row, size_t n_rows, size_t row_ptr_begin, size_t row_stride,
 unsigned int null_gidx_value) {
@@ -261,7 +261,7 @@ __global__ void compress_bin_ellpack_k
  int row_size = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
  unsigned int bin = null_gidx_value;
  if (ifeature < row_size) {
-    RowBatch::Entry entry = entries[row_ptrs[irow] - row_ptr_begin + ifeature];
+    Entry entry = entries[row_ptrs[irow] - row_ptr_begin + ifeature];
    int feature = entry.index;
    float fvalue = entry.fvalue;
    const float *feature_cuts = &cuts[cut_rows[feature]];
@@ -332,7 +332,7 @@ struct DeviceShard {
      param(param),
      prediction_cache_initialised(false) {}

-  void Init(const common::HistCutMatrix& hmat, const RowBatch& row_batch) {
+  void Init(const common::HistCutMatrix& hmat, const SparsePage& row_batch) {
    // copy cuts to the GPU
    dh::safe_cuda(cudaSetDevice(device_idx));
    thrust::device_vector<float> cuts_d(hmat.cut);
@@ -340,7 +340,7 @@ struct DeviceShard {

    // find the maximum row size
    thrust::device_vector<size_t> row_ptr_d(
-        row_batch.ind_ptr + row_begin_idx, row_batch.ind_ptr + row_end_idx + 1);
+        &row_batch.offset[row_begin_idx], &row_batch.offset[row_end_idx + 1]);

    auto row_iter = row_ptr_d.begin();
    auto get_size = [=] __device__(size_t row) {
@@ -369,11 +369,11 @@ struct DeviceShard {
    // bin and compress entries in batches of rows
    // use no more than 1/16th of GPU memory per batch
    size_t gpu_batch_nrows = dh::TotalMemory(device_idx) /
-      (16 * row_stride * sizeof(RowBatch::Entry));
+      (16 * row_stride * sizeof(Entry));
    if (gpu_batch_nrows > n_rows) {
      gpu_batch_nrows = n_rows;
    }
-    thrust::device_vector<RowBatch::Entry> entries_d(gpu_batch_nrows * row_stride);
+    thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
    size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
    for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
      size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
@@ -383,13 +383,13 @@ struct DeviceShard {
      }
      size_t batch_nrows = batch_row_end - batch_row_begin;
      size_t n_entries =
-        row_batch.ind_ptr[row_begin_idx + batch_row_end] -
-        row_batch.ind_ptr[row_begin_idx + batch_row_begin];
+        row_batch.offset[row_begin_idx + batch_row_end] -
+        row_batch.offset[row_begin_idx + batch_row_begin];
      dh::safe_cuda
        (cudaMemcpy
         (entries_d.data().get(),
-          &row_batch.data_ptr[row_batch.ind_ptr[row_begin_idx + batch_row_begin]],
-          n_entries * sizeof(RowBatch::Entry), cudaMemcpyDefault));
+          &row_batch.data[row_batch.offset[row_begin_idx + batch_row_begin]],
+          n_entries * sizeof(Entry), cudaMemcpyDefault));
      dim3 block3(32, 8, 1);
      dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
                 dh::DivRoundUp(row_stride, block3.y), 1);
@@ -398,7 +398,7 @@ struct DeviceShard {
         row_ptr_d.data().get() + batch_row_begin,
         entries_d.data().get(), cuts_d.data().get(), cut_row_ptrs_d.data().get(),
         batch_row_begin, batch_nrows,
-         row_batch.ind_ptr[row_begin_idx + batch_row_begin],
+         row_batch.offset[row_begin_idx + batch_row_begin],
         row_stride, null_gidx_value);

      dh::safe_cuda(cudaGetLastError());
@@ -702,10 +702,10 @@ class GPUHistMaker : public TreeUpdater {

    monitor_.Start("BinningCompression", device_list_);
    {
-      dmlc::DataIter<RowBatch>* iter = dmat->RowIterator();
+      dmlc::DataIter<SparsePage>* iter = dmat->RowIterator();
      iter->BeforeFirst();
      CHECK(iter->Next()) << "Empty batches are not supported";
-      const RowBatch& batch = iter->Value();
+      const SparsePage& batch = iter->Value();
      // Create device shards
      dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
          shard = std::unique_ptr<DeviceShard>
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -344,17 +344,18 @@ class CQHistMaker: public HistMaker<TStats> {
    {
      thread_hist_.resize(omp_get_max_threads());
      // start accumulating statistics
-      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(fset);
+      auto iter = p_fmat->ColIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
+        auto batch = iter->Value();
        // start enumeration
-        const auto nsize = static_cast<bst_omp_uint>(batch.size);
+        const auto nsize = static_cast<bst_omp_uint>(fset.size());
        #pragma omp parallel for schedule(dynamic, 1)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int offset = feat2workindex_[batch.col_index[i]];
+          int fid = fset[i];
+          int offset = feat2workindex_[fid];
          if (offset >= 0) {
-            this->UpdateHistCol(gpair, batch[i], info, tree,
+            this->UpdateHistCol(gpair, batch[fid], info, tree,
                                fset, offset,
                                &thread_hist_[omp_get_thread_num()]);
          }
@@ -425,20 +426,20 @@ class CQHistMaker: public HistMaker<TStats> {
      work_set_.resize(std::unique(work_set_.begin(), work_set_.end()) - work_set_.begin());

      // start accumulating statistics
-      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(work_set_);
+      auto iter = p_fmat->ColIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
+        auto batch = iter->Value();
        // TWOPASS: use the real set + split set in the column iteration.
        this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree);

        // start enumeration
-        const auto nsize = static_cast<bst_omp_uint>(batch.size);
+        const auto nsize = static_cast<bst_omp_uint>(batch.Size());
        #pragma omp parallel for schedule(dynamic, 1)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int offset = feat2workindex_[batch.col_index[i]];
+        for (bst_omp_uint fid = 0; fid < nsize; ++fid) {
+          int offset = feat2workindex_[fid];
          if (offset >= 0) {
-            this->UpdateSketchCol(gpair, batch[i], tree,
+            this->UpdateSketchCol(gpair, batch[fid], tree,
                                  work_set_size, offset,
                                  &thread_sketch_[omp_get_thread_num()]);
          }
@@ -494,7 +495,7 @@ class CQHistMaker: public HistMaker<TStats> {
  }

  inline void UpdateHistCol(const std::vector<GradientPair> &gpair,
-                            const ColBatch::Inst &c,
+                            const SparsePage::Inst &c,
                            const MetaInfo &info,
                            const RegTree &tree,
                            const std::vector<bst_uint> &fset,
@@ -546,7 +547,7 @@ class CQHistMaker: public HistMaker<TStats> {
    }
  }
  inline void UpdateSketchCol(const std::vector<GradientPair> &gpair,
-                              const ColBatch::Inst &c,
+                              const SparsePage::Inst &c,
                              const RegTree &tree,
                              size_t work_set_size,
                              bst_uint offset,
@@ -712,18 +713,18 @@ class GlobalProposalHistMaker: public CQHistMaker<TStats> {
          std::unique(this->work_set_.begin(), this->work_set_.end()) - this->work_set_.begin());

      // start accumulating statistics
-      dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator(this->work_set_);
+      auto iter = p_fmat->ColIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
+        auto batch = iter->Value();
        // TWOPASS: use the real set + split set in the column iteration.
        this->CorrectNonDefaultPositionByBatch(batch, this->fsplit_set_, tree);

        // start enumeration
-        const auto nsize = static_cast<bst_omp_uint>(batch.size);
+        const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
        #pragma omp parallel for schedule(dynamic, 1)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int offset = this->feat2workindex_[batch.col_index[i]];
+          int offset = this->feat2workindex_[this->work_set_[i]];
          if (offset >= 0) {
            this->UpdateHistCol(gpair, batch[i], info, tree,
                                fset, offset,
@@ -769,19 +770,19 @@ class QuantileHistMaker: public HistMaker<TStats> {
      sketchs_[i].Init(info.num_row_, this->param_.sketch_eps);
    }
    // start accumulating statistics
-    dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
+    auto iter = p_fmat->RowIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
+      auto batch = iter->Value();
      // parallel convert to column major format
-      common::ParallelGroupBuilder<SparseBatch::Entry>
+      common::ParallelGroupBuilder<Entry>
          builder(&col_ptr_, &col_data_, &thread_col_ptr_);
      builder.InitBudget(tree.param.num_feature, nthread);

-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.Size());
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
+        SparsePage::Inst inst = batch[i];
        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        int nid = this->position_[ridx];
        if (nid >= 0) {
@@ -800,13 +801,13 @@ class QuantileHistMaker: public HistMaker<TStats> {
      builder.InitStorage();
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
+        SparsePage::Inst inst = batch[i];
        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        const int nid = this->position_[ridx];
        if (nid >= 0) {
          for (bst_uint j = 0; j < inst.length; ++j) {
            builder.Push(inst[j].index,
-                         SparseBatch::Entry(nid, inst[j].fvalue),
+                         Entry(nid, inst[j].fvalue),
                         omp_get_thread_num());
          }
        }
@@ -816,7 +817,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
      #pragma omp parallel for schedule(dynamic, 1)
      for (bst_omp_uint k = 0; k < nfeat; ++k) {
        for (size_t i = col_ptr_[k]; i < col_ptr_[k+1]; ++i) {
-          const SparseBatch::Entry &e = col_data_[i];
+          const Entry &e = col_data_[i];
          const int wid = this->node2workindex_[e.index];
          sketchs_[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].GetHess());
        }
@@ -873,7 +874,7 @@ class QuantileHistMaker: public HistMaker<TStats> {
  // local temp column data structure
  std::vector<size_t> col_ptr_;
  // local storage of column data
-  std::vector<SparseBatch::Entry> col_data_;
+  std::vector<Entry> col_data_;
  std::vector<std::vector<size_t> > thread_col_ptr_;
  // per node, per feature sketch
  std::vector<common::WQuantileSketch<bst_float, bst_float> > sketchs_;
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -57,15 +57,15 @@ class TreeRefresher: public TreeUpdater {
    {
      const MetaInfo &info = p_fmat->Info();
      // start accumulating statistics
-      dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
+       auto *iter = p_fmat->RowIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
-        const RowBatch &batch = iter->Value();
-        CHECK_LT(batch.size, std::numeric_limits<unsigned>::max());
-        const auto nbatch = static_cast<bst_omp_uint>(batch.size);
+         auto batch = iter->Value();
+        CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
+        const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
        #pragma omp parallel for schedule(static)
        for (bst_omp_uint i = 0; i < nbatch; ++i) {
-          RowBatch::Inst inst = batch[i];
+          SparsePage::Inst inst = batch[i];
          const int tid = omp_get_thread_num();
          const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
          RegTree::FVec &feats = fvec_temp[tid];
--- a/src/tree/updater_skmaker.cc
+++ b/src/tree/updater_skmaker.cc
@@ -144,18 +144,18 @@ class SketchMaker: public BaseMaker {
    // number of rows in
    const size_t nrows = p_fmat->BufferedRowset().Size();
    // start accumulating statistics
-    dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
+    auto iter = p_fmat->ColIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
+      auto batch = iter->Value();
      // start enumeration
-      const auto nsize = static_cast<bst_omp_uint>(batch.size);
+      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        this->UpdateSketchCol(gpair, batch[i], tree,
+      for (bst_omp_uint fidx = 0; fidx < nsize; ++fidx) {
+        this->UpdateSketchCol(gpair, batch[fidx], tree,
                              node_stats_,
-                              batch.col_index[i],
-                              batch[i].length == nrows,
+                              fidx,
+                              batch[fidx].length == nrows,
                              &thread_sketch_[omp_get_thread_num()]);
      }
    }
@@ -174,7 +174,7 @@ class SketchMaker: public BaseMaker {
  }
  // update sketch information in column fid
  inline void UpdateSketchCol(const std::vector<GradientPair> &gpair,
-                              const ColBatch::Inst &c,
+                              const SparsePage::Inst &c,
                              const RegTree &tree,
                              const std::vector<SKStats> &nstats,
                              bst_uint fid,