[LIBXGBOOST] pass demo running.

2016-01-05 21:49:48 -08:00
parent cee148ed64
commit d75e3ed05d
59 changed files with 1611 additions and 1845 deletions
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -3,7 +3,12 @@
 * \file data.cc
 */
 #include <xgboost/data.h>
+#include <xgboost/logging.h>
 #include <cstring>
+#include "./sparse_batch_page.h"
+#include "./simple_dmatrix.h"
+#include "./simple_csr_source.h"
+#include "../common/io.h"

 namespace xgboost {
 // implementation of inline functions
@@ -83,4 +88,83 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
  }
 }

+
+// Load a DMatrix from `uri`, which is either `path` or `path#cache_prefix`.
+//  silent:         when true, the summary line about the loaded matrix is not logged
+//  load_row_split: when true, rabit rank / world size select the part to load
+//  file_format:    parser format name; "auto" first probes for the binary format
+//                  and otherwise parses the file as "libsvm"
+DMatrix* DMatrix::Load(const std::string& uri,
+                       bool silent,
+                       bool load_row_split,
+                       const std::string& file_format) {
+  std::string fname = uri;
+  std::string cache_file;
+  const size_t sharp = uri.find('#');
+  if (sharp != std::string::npos) {
+    fname = uri.substr(0, sharp);
+    cache_file = uri.substr(sharp + 1, uri.length());
+    CHECK_EQ(cache_file.find('#'), std::string::npos)
+        << "Only one `#` is allowed in file path for cache file specification.";
+    if (load_row_split) {
+      // tag the cache file name with the rank of this worker
+      std::ostringstream tagged;
+      tagged << cache_file << ".r" << rabit::GetRank();
+      cache_file = tagged.str();
+    }
+  }
+  const int partid = load_row_split ? rabit::GetRank() : 0;
+  const int npart = load_row_split ? rabit::GetWorldSize() : 1;
+
+  // prints the shape of a freshly loaded matrix unless silenced
+  auto report = [&uri, silent](DMatrix* loaded) {
+    if (silent) return;
+    LOG(CONSOLE) << loaded->info().num_row << 'x' << loaded->info().num_col << " matrix with "
+                 << loaded->info().num_nonzero << " entries loaded from " << uri;
+  };
+
+  // legacy handling of binary data loading
+  if (file_format == "auto" && !load_row_split) {
+    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
+    common::PeekableInStream is(fi.get());
+    int magic;
+    // peek so that the magic number is still available to LoadBinary
+    const bool has_magic = is.PeekRead(&magic, sizeof(magic)) == sizeof(magic);
+    if (has_magic && magic == data::SimpleCSRSource::kMagic) {
+      std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+      source->LoadBinary(&is);
+      DMatrix* binary_mat = DMatrix::Create(std::move(source), cache_file);
+      report(binary_mat);
+      return binary_mat;
+    }
+  }
+
+  const std::string ftype = (file_format == "auto") ? std::string("libsvm") : file_format;
+  std::unique_ptr<dmlc::Parser<uint32_t> > parser(
+      dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, ftype.c_str()));
+  DMatrix* parsed_mat = DMatrix::Create(parser.get(), cache_file);
+  report(parsed_mat);
+  return parsed_mat;
+}
+
+// Build a DMatrix from the rows produced by `parser`.
+// A non-empty cache_prefix asks for external memory, which is rejected with LOG(FATAL).
+DMatrix* DMatrix::Create(dmlc::Parser<uint32_t>* parser,
+                         const std::string& cache_prefix) {
+  if (cache_prefix.length() != 0) {
+    LOG(FATAL) << "external memory not yet implemented";
+    return nullptr;
+  }
+  // in-memory path: copy everything from the parser into a CSR source
+  std::unique_ptr<data::SimpleCSRSource> csr(new data::SimpleCSRSource());
+  csr->CopyFrom(parser);
+  return DMatrix::Create(std::move(csr), cache_prefix);
+}
+
+// Write this matrix to `fname` in the binary format of SimpleCSRSource.
+void DMatrix::SaveToLocalFile(const std::string& fname) {
+  // copy the content into a CSR source before the output file is opened
+  data::SimpleCSRSource snapshot;
+  snapshot.CopyFrom(this);
+  const char* path = fname.c_str();
+  std::unique_ptr<dmlc::Stream> out(dmlc::Stream::Create(path, "w"));
+  snapshot.SaveBinary(out.get());
+}
+
+// Wrap `source` into an in-memory SimpleDMatrix; ownership of the source is
+// transferred to the returned matrix. cache_prefix is not used by this overload.
+DMatrix* DMatrix::Create(std::unique_ptr<DataSource>&& source,
+                         const std::string& cache_prefix) {
+  DMatrix* in_memory = new data::SimpleDMatrix(std::move(source));
+  return in_memory;
+}
 }  // namespace xgboost
--- a/src/data/simple_csr_source.cc
+++ b/src/data/simple_csr_source.cc
@@ -3,7 +3,7 @@
 * \file simple_csr_source.cc
 */
 #include <dmlc/base.h>
-#include <dmlc/logging.h>
+#include <xgboost/logging.h>
 #include "./simple_csr_source.h"

 namespace xgboost {
@@ -80,7 +80,7 @@ void SimpleCSRSource::SaveBinary(dmlc::Stream* fo) const {
 }

 void SimpleCSRSource::BeforeFirst() {
-  at_first_ = false;
+  at_first_ = true;
 }

 bool SimpleCSRSource::Next() {
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -0,0 +1,265 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file simple_dmatrix.cc
+ * \brief the input data structure for gradient boosting
+ * \author Tianqi Chen
+ */
+#include <xgboost/data.h>
+#include <limits>
+#include <algorithm>
+#include <vector>
+#include "./simple_dmatrix.h"
+#include "../common/random.h"
+#include "../common/group_data.h"
+
+namespace xgboost {
+namespace data {
+
+// Advance to the next column page and expose the selected columns of that page
+// through batch_. Returns false once every page has been visited.
+bool SimpleDMatrix::ColBatchIter::Next() {
+  if (data_ptr_ >= cpages_.size()) return false;
+  SparsePage* page = cpages_[data_ptr_].get();
+  data_ptr_ += 1;
+  const size_t nselected = col_index_.size();
+  batch_.size = nselected;
+  col_data_.resize(nselected, SparseBatch::Inst(nullptr, 0));
+  for (size_t k = 0; k < nselected; ++k) {
+    // column cidx occupies data[offset[cidx]] .. data[offset[cidx + 1] - 1]
+    const bst_uint cidx = col_index_[k];
+    const size_t seg_begin = page->offset[cidx];
+    const size_t seg_end = page->offset[cidx + 1];
+    col_data_[k] = SparseBatch::Inst(dmlc::BeginPtr(page->data) + seg_begin,
+                                     static_cast<bst_uint>(seg_end - seg_begin));
+  }
+  batch_.col_index = dmlc::BeginPtr(col_index_);
+  batch_.col_data = dmlc::BeginPtr(col_data_);
+  return true;
+}
+
+// Column iterator over every column: the selection is 0, 1, ..., num_col - 1.
+// The returned iterator is owned by this matrix and is rewound before return.
+dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator() {
+  const size_t ncol = this->info().num_col;
+  std::vector<bst_uint>& selected = col_iter_.col_index_;
+  selected.resize(ncol);
+  bst_uint next_col = 0;
+  for (bst_uint& slot : selected) {
+    slot = next_col++;
+  }
+  col_iter_.BeforeFirst();
+  return &col_iter_;
+}
+
+// Column iterator restricted to the features listed in fset; indices that are
+// not smaller than num_col are dropped. The iterator is rewound before return.
+dmlc::DataIter<ColBatch>* SimpleDMatrix::ColIterator(const std::vector<bst_uint>&fset) {
+  const size_t ncol = this->info().num_col;
+  std::vector<bst_uint>& selected = col_iter_.col_index_;
+  selected.clear();
+  for (const bst_uint fid : fset) {
+    if (fid >= ncol) continue;
+    selected.push_back(fid);
+  }
+  col_iter_.BeforeFirst();
+  return &col_iter_;
+}
+
+// Build the column pages once; later calls return immediately.
+//  enabled:          per-feature flag forwarded to the page builders
+//  pkeep:            row keep probability forwarded to the page builders
+//  max_row_perbatch: a single page is built when num_row is below this value,
+//                    otherwise the rows are split over several pages
+void SimpleDMatrix::InitColAccess(const std::vector<bool> &enabled,
+                                  float pkeep,
+                                  size_t max_row_perbatch) {
+  if (this->HaveColAccess()) return;
+
+  std::vector<std::unique_ptr<SparsePage> >& pages = col_iter_.cpages_;
+  pages.clear();
+  if (info().num_row >= max_row_perbatch) {
+    this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
+  } else {
+    std::unique_ptr<SparsePage> single(new SparsePage());
+    this->MakeOneBatch(enabled, pkeep, single.get());
+    pages.push_back(std::move(single));
+  }
+  // setup col-size: number of stored entries per column, summed over pages
+  col_size_.assign(info().num_col, 0);
+  for (const std::unique_ptr<SparsePage>& page : pages) {
+    const size_t nseg = page->Size();
+    for (size_t c = 0; c < nseg; ++c) {
+      col_size_[c] += page->offset[c + 1] - page->offset[c];
+    }
+  }
+}
+
+// internal function to make one batch from row iter.
+//
+// Every row delivered by RowIterator() is kept with probability pkeep (always
+// kept when pkeep == 1.0f); kept row ids are appended to buffered_rowset_.
+// Entries of kept rows whose feature is flagged in `enabled` are grouped by
+// column into pcol, and each column is then sorted with
+// SparseBatch::Entry::CmpValue.
+void SimpleDMatrix::MakeOneBatch(const std::vector<bool>& enabled,
+                                 float pkeep,
+                                 SparsePage *pcol) {
+  // clear rowset
+  buffered_rowset_.clear();
+  // bit map over global row ids: bmap[ridx] is true when the row is kept
+  int nthread;
+  std::vector<bool> bmap;
+  #pragma omp parallel
+  {
+    nthread = omp_get_num_threads();
+  }
+
+  pcol->Clear();
+  common::ParallelGroupBuilder<SparseBatch::Entry>
+      builder(&pcol->offset, &pcol->data);
+  builder.InitBudget(info().num_col, nthread);
+  // first pass: sample rows and count the entries of every column
+  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
+  iter->BeforeFirst();
+  while (iter->Next()) {
+    const RowBatch& batch = iter->Value();
+    bmap.resize(bmap.size() + batch.size, true);
+    std::bernoulli_distribution coin_flip(pkeep);
+    auto& rnd = common::GlobalRandom();
+
+    long batch_size = static_cast<long>(batch.size); // NOLINT(*)
+    // sampling stays sequential: it draws from the shared random engine
+    for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
+      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+      if (pkeep == 1.0f || coin_flip(rnd)) {
+        buffered_rowset_.push_back(ridx);
+      } else {
+        // index by global row id, the same key both passes below read
+        bmap[ridx] = false;
+      }
+    }
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
+      int tid = omp_get_thread_num();
+      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+      if (bmap[ridx]) {
+        RowBatch::Inst inst = batch[i];
+        for (bst_uint j = 0; j < inst.length; ++j) {
+          if (enabled[inst[j].index]) {
+            builder.AddBudget(inst[j].index, tid);
+          }
+        }
+      }
+    }
+  }
+  builder.InitStorage();
+
+  // second pass: fill the storage with exactly the entries counted above
+  iter->BeforeFirst();
+  while (iter->Next()) {
+    const RowBatch& batch = iter->Value();
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
+      int tid = omp_get_thread_num();
+      bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+      if (bmap[ridx]) {
+        RowBatch::Inst inst = batch[i];
+        for (bst_uint j = 0; j < inst.length; ++j) {
+          if (enabled[inst[j].index]) {
+            builder.Push(inst[j].index,
+                         SparseBatch::Entry(ridx, inst[j].fvalue), tid);
+          }
+        }
+      }
+    }
+  }
+
+  CHECK_EQ(pcol->Size(), info().num_col);
+  // sort columns
+  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
+  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ncol; ++i) {
+    if (pcol->offset[i] < pcol->offset[i + 1]) {
+      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
+                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
+                SparseBatch::Entry::CmpValue);
+    }
+  }
+}
+
+// Build several column pages from the row iterator. Rows are kept with
+// probability pkeep (always when pkeep == 1.0f); kept rows are staged, and a
+// column page is produced each time the staged row count reaches
+// max_row_perbatch, plus one more for whatever is left at the end.
+void SimpleDMatrix::MakeManyBatch(const std::vector<bool>& enabled,
+                                  float pkeep,
+                                  size_t max_row_perbatch) {
+  std::bernoulli_distribution keep_row(pkeep);
+  auto& rng = common::GlobalRandom();
+  buffered_rowset_.clear();
+  // rows staged for the page that is currently being assembled
+  SparsePage staged; staged.Clear();
+  // position in buffered_rowset_ of the first staged row
+  size_t page_begin = 0;
+  // turns the staged rows into one column page and appends it to the iterator
+  auto emit_page = [&]() {
+    std::unique_ptr<SparsePage> page(new SparsePage());
+    this->MakeColPage(staged.GetRowBatch(0),
+                      dmlc::BeginPtr(buffered_rowset_) + page_begin,
+                      enabled, page.get());
+    col_iter_.cpages_.push_back(std::move(page));
+  };
+
+  dmlc::DataIter<RowBatch>* iter = this->RowIterator();
+  iter->BeforeFirst();
+  while (iter->Next()) {
+    const RowBatch &batch = iter->Value();
+    for (size_t i = 0; i < batch.size; ++i) {
+      if (pkeep == 1.0f || keep_row(rng)) {
+        buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid + i));
+        staged.Push(batch[i]);
+      }
+      if (staged.Size() < max_row_perbatch) continue;
+      emit_page();
+      page_begin = buffered_rowset_.size();
+      staged.Clear();
+    }
+  }
+
+  // leftover rows that did not fill a whole page
+  if (staged.Size() != 0) {
+    emit_page();
+  }
+}
+
+// make column page from subset of rowbatchs
+//  batch:   rows to regroup by column
+//  ridx:    ridx[i] is the row id recorded for the i-th row of batch
+//  enabled: per-feature flag; entries of features that are not enabled are skipped
+//  pcol:    output page; cleared first, ends up with info().num_col segments,
+//           each sorted with SparseBatch::Entry::CmpValue
+void SimpleDMatrix::MakeColPage(const RowBatch& batch,
+                                const bst_uint* ridx,
+                                const std::vector<bool>& enabled,
+                                SparsePage* pcol) {
+  int nthread;
+  #pragma omp parallel
+  {
+    nthread = omp_get_num_threads();
+  }
+  // cap the team size outside of the parallel region, so the threads do not
+  // perform a read-modify-write on the shared variable
+  nthread = std::min(nthread, std::max(omp_get_num_procs() / 2 - 2, 1));
+  pcol->Clear();
+  common::ParallelGroupBuilder<SparseBatch::Entry>
+      builder(&pcol->offset, &pcol->data);
+  builder.InitBudget(info().num_col, nthread);
+  bst_omp_uint ndata = static_cast<bst_omp_uint>(batch.size);
+  // first pass: count the entries of every enabled column
+  #pragma omp parallel for schedule(static) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ndata; ++i) {
+    int tid = omp_get_thread_num();
+    RowBatch::Inst inst = batch[i];
+    for (bst_uint j = 0; j < inst.length; ++j) {
+      const SparseBatch::Entry &e = inst[j];
+      if (enabled[e.index]) {
+        builder.AddBudget(e.index, tid);
+      }
+    }
+  }
+  builder.InitStorage();
+  // second pass: must apply the same filter as the counting pass, otherwise
+  // more entries are pushed than storage was budgeted for
+  #pragma omp parallel for schedule(static) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ndata; ++i) {
+    int tid = omp_get_thread_num();
+    RowBatch::Inst inst = batch[i];
+    for (bst_uint j = 0; j < inst.length; ++j) {
+      const SparseBatch::Entry &e = inst[j];
+      if (enabled[e.index]) {
+        builder.Push(e.index,
+                     SparseBatch::Entry(ridx[i], e.fvalue),
+                     tid);
+      }
+    }
+  }
+  CHECK_EQ(pcol->Size(), info().num_col);
+  // sort columns
+  bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
+  #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
+  for (bst_omp_uint i = 0; i < ncol; ++i) {
+    if (pcol->offset[i] < pcol->offset[i + 1]) {
+      std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i],
+                dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1],
+                SparseBatch::Entry::CmpValue);
+    }
+  }
+}
+
+// True when the column data is held in at most one page.
+bool SimpleDMatrix::SingleColBlock() const {
+  const size_t npage = col_iter_.cpages_.size();
+  return npage < 2;
+}
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -0,0 +1,119 @@
+/*!
+ * Copyright 2015 by Contributors
+ * \file simple_dmatrix.h
+ * \brief In-memory version of DMatrix.
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_H_
+#define XGBOOST_DATA_SIMPLE_DMATRIX_H_
+
+#include <xgboost/base.h>
+#include <xgboost/data.h>
+#include <vector>
+#include <algorithm>
+#include <cstring>
+#include "./sparse_batch_page.h"
+
+namespace xgboost {
+namespace data {
+
+/*!
+ * \brief DMatrix backed by a single DataSource, with column pages that are
+ *  built on demand by InitColAccess.
+ */
+class SimpleDMatrix : public DMatrix {
+ public:
+  /*!
+   * \brief take ownership of the data source
+   * \param source the row data source backing this matrix
+   */
+  explicit SimpleDMatrix(std::unique_ptr<DataSource>&& source)
+      : source_(std::move(source)) {}
+
+  /*! \return meta information stored in the source */
+  MetaInfo& info() override {
+    return source_->info;
+  }
+
+  /*! \return meta information stored in the source (read only) */
+  const MetaInfo& info() const override {
+    return source_->info;
+  }
+
+  /*! \return the source viewed as a row iterator, rewound to the beginning */
+  dmlc::DataIter<RowBatch>* RowIterator() override {
+    dmlc::DataIter<RowBatch>* iter = source_.get();
+    iter->BeforeFirst();
+    return iter;
+  }
+
+  /*! \return whether the per-column sizes have been filled in by InitColAccess */
+  bool HaveColAccess() const override {
+    return col_size_.size() != 0;
+  }
+
+  /*! \return ids of the rows that were kept when the column pages were built */
+  const std::vector<bst_uint>& buffered_rowset() const override {
+    return buffered_rowset_;
+  }
+
+  /*!
+   * \param cidx column index, must be smaller than the number of columns
+   * \return number of entries stored for the column
+   */
+  size_t GetColSize(size_t cidx) const {
+    return col_size_[cidx];
+  }
+
+  /*!
+   * \param cidx column index, must be smaller than the number of columns
+   * \return fraction of buffered rows that have an entry in the column;
+   *  0 when no row is buffered
+   */
+  float GetColDensity(size_t cidx) const override {
+    // without this guard an empty rowset evaluates 0 / 0 below
+    if (buffered_rowset_.empty()) return 0.0f;
+    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+  }
+
+  dmlc::DataIter<ColBatch>* ColIterator() override;
+
+  dmlc::DataIter<ColBatch>* ColIterator(const std::vector<bst_uint>& fset) override;
+
+  void InitColAccess(const std::vector<bool>& enabled,
+                     float subsample,
+                     size_t max_row_perbatch) override;
+
+  bool SingleColBlock() const override;
+
+ private:
+  // in-memory column batch iterator.
+  struct ColBatchIter: dmlc::DataIter<ColBatch> {
+   public:
+    ColBatchIter() : data_ptr_(0) {}
+    // rewind to the first column page
+    void BeforeFirst() override {
+      data_ptr_ = 0;
+    }
+    // batch prepared by the latest call to Next
+    const ColBatch &Value() const override {
+      return batch_;
+    }
+    bool Next() override;
+
+   private:
+    // allow SimpleDMatrix to access it.
+    friend class SimpleDMatrix;
+    // indices of the columns selected for iteration
+    std::vector<bst_uint> col_index_;
+    // column content, one instance per selected column
+    std::vector<ColBatch::Inst> col_data_;
+    // column sparse pages
+    std::vector<std::unique_ptr<SparsePage> > cpages_;
+    // index of the next page to visit
+    size_t data_ptr_;
+    // temporal space for batch
+    ColBatch batch_;
+  };
+
+  // source data pointer.
+  std::unique_ptr<DataSource> source_;
+  // column iterator
+  ColBatchIter col_iter_;
+  // list of row index that are buffered.
+  std::vector<bst_uint> buffered_rowset_;
+  /*! \brief sizeof column data */
+  std::vector<size_t> col_size_;
+
+  // internal function to make one batch from row iter.
+  void MakeOneBatch(const std::vector<bool>& enabled,
+                    float pkeep,
+                    SparsePage *pcol);
+
+  // internal function to make several column pages from row iter.
+  void MakeManyBatch(const std::vector<bool>& enabled,
+                     float pkeep,
+                     size_t max_row_perbatch);
+
+  // internal function to make one column page from a row batch;
+  // ridx holds the row id to record for each row of the batch.
+  void MakeColPage(const RowBatch& batch,
+                   const bst_uint* ridx,
+                   const std::vector<bool>& enabled,
+                   SparsePage* pcol);
+};
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SIMPLE_DMATRIX_H_
--- a/src/data/sparse_batch_page.h
+++ b/src/data/sparse_batch_page.h
@@ -0,0 +1,214 @@
+/*!
+ * Copyright (c) 2014 by Contributors
+ * \file sparse_batch_page.h
+ *   content holder of a sparse batch that can be saved to disk;
+ *   the representation can be used effectively
+ *   in external-memory computation
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
+#define XGBOOST_DATA_SPARSE_BATCH_PAGE_H_
+
+#include <xgboost/data.h>
+#include <dmlc/io.h>
+#include <vector>
+#include <algorithm>
+
+namespace xgboost {
+namespace data {
+/*!
+ * \brief in-memory storage unit of sparse batch
+ *
+ *  Segment i occupies data[offset[i]] ... data[offset[i + 1] - 1].
+ *  offset always starts with 0, so a page of n segments holds n + 1 offsets.
+ */
+class SparsePage {
+ public:
+  /*! \brief offset of the segments */
+  std::vector<size_t> offset;
+  /*! \brief the data of the segments */
+  std::vector<SparseBatch::Entry> data;
+
+  /*! \brief constructor */
+  SparsePage() {
+    this->Clear();
+  }
+  /*! \return number of instance in the page */
+  inline size_t Size() const {
+    return offset.size() - 1;
+  }
+  /*!
+   * \brief load only the segments we are interested in
+   * \param fi the input stream of the file
+   * \param sorted_index_set sorted index of segments we are interested in
+   * \return true of the loading as successful, false if end of file was reached
+   */
+  inline bool Load(dmlc::SeekStream *fi,
+                   const std::vector<bst_uint> &sorted_index_set) {
+    if (!fi->Read(&disk_offset_)) return false;
+    // back() is used below, an empty offset array cannot come from Save
+    CHECK_NE(disk_offset_.size(), 0) << "Invalid SparsePage file";
+    // setup the offset
+    offset.clear(); offset.push_back(0);
+    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
+      bst_uint fid = sorted_index_set[i];
+      CHECK_LT(fid + 1, disk_offset_.size());
+      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
+      offset.push_back(offset.back() + size);
+    }
+    data.resize(offset.back());
+    // read in the data
+    size_t begin = fi->Tell();
+    size_t curr_offset = 0;
+    for (size_t i = 0; i < sorted_index_set.size();) {
+      bst_uint fid = sorted_index_set[i];
+      if (disk_offset_[fid] != curr_offset) {
+        CHECK_GT(disk_offset_[fid], curr_offset);
+        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
+        curr_offset = disk_offset_[fid];
+      }
+      // merge the run of requested segments that are adjacent on disk,
+      // so that they are fetched with a single read
+      size_t j, size_to_read = 0;
+      for (j = i; j < sorted_index_set.size(); ++j) {
+        if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
+          size_to_read += offset[j + 1] - offset[j];
+        } else {
+          break;
+        }
+      }
+
+      if (size_to_read != 0) {
+        CHECK_EQ(fi->Read(dmlc::BeginPtr(data) + offset[i],
+                          size_to_read * sizeof(SparseBatch::Entry)),
+                 size_to_read * sizeof(SparseBatch::Entry))
+            << "Invalid SparsePage file";
+        curr_offset += size_to_read;
+      }
+      i = j;
+    }
+    // seek to end of record
+    if (curr_offset != disk_offset_.back()) {
+      fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
+    }
+    return true;
+  }
+  /*!
+   * \brief load all the segments
+   * \param fi the input stream of the file
+   * \return true of the loading as successful, false if end of file was reached
+   */
+  inline bool Load(dmlc::Stream *fi) {
+    if (!fi->Read(&offset)) return false;
+    CHECK_NE(offset.size(), 0) << "Invalid SparsePage file";
+    data.resize(offset.back());
+    if (data.size() != 0) {
+      CHECK_EQ(fi->Read(dmlc::BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)),
+               data.size() * sizeof(SparseBatch::Entry))
+          << "Invalid SparsePage file";
+    }
+    return true;
+  }
+  /*!
+   * \brief save the data to fo: the offset array (leading 0 included)
+   *    followed by the raw entries
+   * \param fo output stream
+   */
+  inline void Save(dmlc::Stream *fo) const {
+    CHECK(offset.size() != 0 && offset[0] == 0);
+    CHECK_EQ(offset.back(), data.size());
+    fo->Write(offset);
+    if (data.size() != 0) {
+      fo->Write(dmlc::BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
+    }
+  }
+  /*! \return estimation of memory cost of this page */
+  inline size_t MemCostBytes(void) const {
+    return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
+  }
+  /*! \brief clear the page */
+  inline void Clear(void) {
+    offset.clear();
+    offset.push_back(0);
+    data.clear();
+  }
+  /*!
+   * \brief load all the segments and add it to existing batch
+   * \param fi the input stream of the file
+   * \return true of the loading as successful, false if end of file was reached
+   */
+  inline bool PushLoad(dmlc::Stream *fi) {
+    if (!fi->Read(&disk_offset_)) return false;
+    // Save writes the offset array together with its leading 0
+    CHECK(disk_offset_.size() != 0 && disk_offset_[0] == 0)
+        << "Invalid SparsePage file";
+    const size_t top = offset.back();
+    const size_t nentry = disk_offset_.back();
+    data.resize(top + nentry);
+    if (nentry != 0) {
+      CHECK_EQ(fi->Read(dmlc::BeginPtr(data) + top,
+                        nentry * sizeof(SparseBatch::Entry)),
+               nentry * sizeof(SparseBatch::Entry))
+          << "Invalid SparsePage file";
+    }
+    // disk_offset_[0] is the start of the first loaded segment and is already
+    // represented by offset.back(); appending it again would insert an
+    // empty segment in front of the loaded ones
+    const size_t begin = offset.size();
+    const size_t nsegment = disk_offset_.size() - 1;
+    offset.resize(begin + nsegment);
+    for (size_t i = 0; i < nsegment; ++i) {
+      offset[i + begin] = top + disk_offset_[i + 1];
+    }
+    return true;
+  }
+  /*!
+   * \brief Push row batch into the page
+   * \param batch the row batch
+   */
+  inline void Push(const RowBatch &batch) {
+    const size_t top = offset.back();
+    // the batch may be a window that does not start at entry 0 of data_ptr,
+    // so the number of entries is relative to ind_ptr[0]
+    const size_t nentry = batch.ind_ptr[batch.size] - batch.ind_ptr[0];
+    data.resize(top + nentry);
+    if (nentry != 0) {
+      std::memcpy(dmlc::BeginPtr(data) + top,
+                  batch.data_ptr + batch.ind_ptr[0],
+                  sizeof(SparseBatch::Entry) * nentry);
+    }
+    const size_t begin = offset.size();
+    offset.resize(begin + batch.size);
+    for (size_t i = 0; i < batch.size; ++i) {
+      offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
+    }
+  }
+  /*!
+   * \brief Push a sparse page
+   * \param batch the row page
+   */
+  inline void Push(const SparsePage &batch) {
+    size_t top = offset.back();
+    data.resize(top + batch.data.size());
+    // nothing to copy for a page without entries
+    if (batch.data.size() != 0) {
+      std::memcpy(dmlc::BeginPtr(data) + top,
+                  dmlc::BeginPtr(batch.data),
+                  sizeof(SparseBatch::Entry) * batch.data.size());
+    }
+    size_t begin = offset.size();
+    offset.resize(begin + batch.Size());
+    for (size_t i = 0; i < batch.Size(); ++i) {
+      offset[i + begin] = top + batch.offset[i + 1];
+    }
+  }
+  /*!
+   * \brief Push one instance into page
+   *  \param inst an instance row
+   */
+  inline void Push(const SparseBatch::Inst &inst) {
+    offset.push_back(offset.back() + inst.length);
+    size_t begin = data.size();
+    data.resize(begin + inst.length);
+    if (inst.length != 0) {
+      std::memcpy(dmlc::BeginPtr(data) + begin, inst.data,
+                  sizeof(SparseBatch::Entry) * inst.length);
+    }
+  }
+  /*!
+   * \param base_rowid base_rowid of the data
+   * \return row batch representation of the page; it points into this page
+   *  and copies nothing
+   */
+  inline RowBatch GetRowBatch(size_t base_rowid) const {
+    RowBatch out;
+    out.base_rowid  = base_rowid;
+    out.ind_ptr = dmlc::BeginPtr(offset);
+    out.data_ptr = dmlc::BeginPtr(data);
+    out.size = offset.size() - 1;
+    return out;
+  }
+
+ private:
+  /*! \brief offset array as read from disk, used by the partial loaders */
+  std::vector<size_t> disk_offset_;
+};
+}  // namespace data
+}  // namespace xgboost
+#endif  // XGBOOST_DATA_SPARSE_BATCH_PAGE_H_