Merge commit '75bf97b57539e5572e7ae8eba72bac6562c63c07'

Conflicts: subtree/rabit/rabit-learn/io/line_split-inl.h subtree/rabit/yarn/build.sh
2015-03-21 00:48:34 -07:00
parent 5648bec8a3 75bf97b575
commit 9ccbeaa8f0
34 changed files with 856 additions and 201 deletions
--- a/subtree/rabit/rabit-learn/.gitignore
+++ b/subtree/rabit/rabit-learn/.gitignore
@@ -0,0 +1,2 @@
+config.mk
+*.log
--- a/subtree/rabit/rabit-learn/io/buffer_reader-inl.h
+++ b/subtree/rabit/rabit-learn/io/buffer_reader-inl.h
@@ -38,6 +38,7 @@ class StreamBufferReader {
      }
    }
  }
+  /*! \brief whether the end of the file has been reached */
  inline bool AtEnd(void) const {
    return read_len_ == 0;
  }
--- a/subtree/rabit/rabit-learn/io/file-inl.h
+++ b/subtree/rabit/rabit-learn/io/file-inl.h
@@ -66,27 +66,36 @@ class FileStream : public utils::ISeekStream {
 };

 /*! \brief line split from normal file system */
-class FileSplit : public LineSplitBase {
+class FileProvider : public LineSplitter::IFileProvider {
 public:
-  explicit FileSplit(const char *uri, unsigned rank, unsigned nsplit) {
-    LineSplitBase::SplitNames(&fnames_, uri, "#");
+  explicit FileProvider(const char *uri) {
+    LineSplitter::SplitNames(&fnames_, uri, "#");
    std::vector<size_t> fsize;
    for (size_t  i = 0; i < fnames_.size(); ++i) {
      if (!std::strncmp(fnames_[i].c_str(), "file://", 7)) {
        std::string tmp = fnames_[i].c_str() + 7;
        fnames_[i] = tmp;        
      }
-      fsize.push_back(GetFileSize(fnames_[i].c_str()));
+      size_t fz = GetFileSize(fnames_[i].c_str());
+      if (fz != 0) {
+        fsize_.push_back(fz);
+      }
    }
-    LineSplitBase::Init(fsize, rank, nsplit);
  }
-  virtual ~FileSplit(void) {}
-  
- protected:
-  virtual utils::ISeekStream *GetFile(size_t file_index) {
+  // destructor
+  virtual ~FileProvider(void) {}  
+  virtual utils::ISeekStream *Open(size_t file_index) {
    utils::Assert(file_index < fnames_.size(), "file index exceed bound"); 
    return new FileStream(fnames_[file_index].c_str(), "rb");
  }
+  virtual const std::vector<size_t> &FileSize(void) const {
+    return fsize_;
+  }
+ private:
+  // file sizes
+  std::vector<size_t> fsize_;
+  // file names
+  std::vector<std::string> fnames_;
  // get file size
  inline static size_t GetFileSize(const char *fname) {
    std::FILE *fp = utils::FopenCheck(fname, "rb");
@@ -96,10 +105,6 @@ class FileSplit : public LineSplitBase {
    std::fclose(fp);
    return fsize;
  }
-  
- private:
-  // file names
-  std::vector<std::string> fnames_;  
 };
 }  // namespace io
 }  // namespace rabit
--- a/subtree/rabit/rabit-learn/io/hdfs-inl.h
+++ b/subtree/rabit/rabit-learn/io/hdfs-inl.h
@@ -6,6 +6,7 @@
 * \author Tianqi Chen
 */
 #include <string>
+#include <cstdlib>
 #include <vector>
 #include <hdfs.h>
 #include <errno.h>
@@ -15,11 +16,15 @@
 /*! \brief io interface */
 namespace rabit {
 namespace io {
-class HDFSStream : public utils::ISeekStream {
+class HDFSStream : public ISeekStream {
 public:
-  HDFSStream(hdfsFS fs, const char *fname, const char *mode)
-      : fs_(fs), at_end_(false) {
-    int flag;
+  HDFSStream(hdfsFS fs,
+             const char *fname,
+             const char *mode,
+             bool disconnect_when_done)
+      : fs_(fs), at_end_(false),
+        disconnect_when_done_(disconnect_when_done) {
+    int flag = 0;
    if (!strcmp(mode, "r")) {
      flag = O_RDONLY;
    } else if (!strcmp(mode, "w"))  {
@@ -35,6 +40,9 @@ class HDFSStream : public utils::ISeekStream {
  }
  virtual ~HDFSStream(void) {
    this->Close();
+    if (disconnect_when_done_) {
+      utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
+    }
  }
  virtual size_t Read(void *ptr, size_t size) {
    tSize nread = hdfsRead(fs_, fp_, ptr, size);
@@ -86,52 +94,69 @@ class HDFSStream : public utils::ISeekStream {
    }
  }  
  
+  inline static std::string GetNameNode(void) {
+    const char *nn = getenv("rabit_hdfs_namenode");
+    if (nn == NULL) {
+      return std::string("default");
+    } else {
+      return std::string(nn);
+    }
+  }
 private:
  hdfsFS fs_;
  hdfsFile fp_;
  bool at_end_;
+  bool disconnect_when_done_;
 };

 /*! \brief line split from normal file system */
-class HDFSSplit : public LineSplitBase {
+class HDFSProvider : public LineSplitter::IFileProvider {
 public:
-  explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
-    fs_ = hdfsConnect("default", 0);
+  explicit HDFSProvider(const char *uri) {
+    fs_ = hdfsConnect(HDFSStream::GetNameNode().c_str(), 0);
+    utils::Check(fs_ != NULL, "error when connecting to default HDFS");
    std::vector<std::string> paths;
-    LineSplitBase::SplitNames(&paths, uri, "#");
+    LineSplitter::SplitNames(&paths, uri, "#");
    // get the files
-    std::vector<size_t> fsize;
    for (size_t  i = 0; i < paths.size(); ++i) {
      hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
+      utils::Check(info != NULL, "path %s do not exist", paths[i].c_str());
      if (info->mKind == 'D') {
        int nentry;
        hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
+        utils::Check(files != NULL, "error when ListDirectory %s", info->mName);
        for (int i = 0; i < nentry; ++i) {
-          if (files[i].mKind == 'F') {
-            fsize.push_back(files[i].mSize);
+          if (files[i].mKind == 'F' && files[i].mSize != 0) {
+            fsize_.push_back(files[i].mSize);            
            fnames_.push_back(std::string(files[i].mName));
          }
        }
        hdfsFreeFileInfo(files, nentry);
      } else {
-        fsize.push_back(info->mSize);
-        fnames_.push_back(std::string(info->mName));
+        if (info->mSize != 0) {
+          fsize_.push_back(info->mSize);
+          fnames_.push_back(std::string(info->mName));
+        }
      }
      hdfsFreeFileInfo(info, 1);
    }
-    LineSplitBase::Init(fsize, rank, nsplit);
  }
-  virtual ~HDFSSplit(void) {}
-  
- protected:
-  virtual utils::ISeekStream *GetFile(size_t file_index) {
+  virtual ~HDFSProvider(void) {
+    utils::Check(hdfsDisconnect(fs_) == 0, "hdfsDisconnect error");
+  }  
+  virtual const std::vector<size_t> &FileSize(void) const {
+    return fsize_;
+  }
+  virtual ISeekStream *Open(size_t file_index) {
    utils::Assert(file_index < fnames_.size(), "file index exceed bound"); 
-    return new HDFSStream(fs_, fnames_[file_index].c_str(), "r");
+    return new HDFSStream(fs_, fnames_[file_index].c_str(), "r", false);
  }
-
+  
 private:
  // hdfs handle
  hdfsFS fs_;
+  // file sizes
+  std::vector<size_t> fsize_;
  // file names
  std::vector<std::string> fnames_;
 };
--- a/subtree/rabit/rabit-learn/io/io-inl.h
+++ b/subtree/rabit/rabit-learn/io/io-inl.h
@@ -30,16 +30,16 @@ inline InputSplit *CreateInputSplit(const char *uri,
    return new SingleFileSplit(uri);
  }
  if (!strncmp(uri, "file://", 7)) {
-    return new FileSplit(uri, part, nsplit);
+    return new LineSplitter(new FileProvider(uri), part, nsplit);
  }
  if (!strncmp(uri, "hdfs://", 7)) {
 #if RABIT_USE_HDFS
-    return new HDFSSplit(uri, part, nsplit);
+    return new LineSplitter(new HDFSProvider(uri), part, nsplit);
 #else
    utils::Error("Please compile with RABIT_USE_HDFS=1");
 #endif
  }
-  return new FileSplit(uri, part, nsplit);  
+  return new LineSplitter(new FileProvider(uri), part, nsplit);
 }
 /*!
 * \brief create an stream, the stream must be able to close
@@ -55,7 +55,8 @@ inline IStream *CreateStream(const char *uri, const char *mode) {
  }
  if (!strncmp(uri, "hdfs://", 7)) {
 #if RABIT_USE_HDFS
-    return new HDFSStream(hdfsConnect("default", 0), uri, mode);
+    return new HDFSStream(hdfsConnect(HDFSStream::GetNameNode().c_str(), 0),
+                          uri, mode, true);
 #else
    utils::Error("Please compile with RABIT_USE_HDFS=1");
 #endif
--- a/subtree/rabit/rabit-learn/io/io.h
+++ b/subtree/rabit/rabit-learn/io/io.h
@@ -19,6 +19,7 @@ namespace rabit {
 * \brief namespace to handle input split and filesystem interfacing
 */
 namespace io {
+/*! \brief reuse the ISeekStream definition from utils */
 typedef utils::ISeekStream ISeekStream;
 /*!
 * \brief user facing input split helper,
--- a/subtree/rabit/rabit-learn/io/line_split-inl.h
+++ b/subtree/rabit/rabit-learn/io/line_split-inl.h
@@ -15,11 +15,42 @@

 namespace rabit {
 namespace io {
-class LineSplitBase : public InputSplit {
+
+/*! \brief class that split the files by line */
+class LineSplitter : public InputSplit {
 public:
-  virtual ~LineSplitBase() {
-    if (fs_ != NULL) delete fs_;
+  class IFileProvider {
+   public:
+    /*!
+     * \brief get the seek stream of given file_index
+     * \return the corresponding seek stream at head of the stream
+     *  the seek stream's resource can be freed by calling delete 
+     */
+    virtual ISeekStream *Open(size_t file_index) = 0;
+    /*!
+     * \return const reference to size of each files
+     */
+    virtual const std::vector<size_t> &FileSize(void) const = 0;
+    // virtual destructor
+    virtual ~IFileProvider() {}
+  };
+  // constructor
+  explicit LineSplitter(IFileProvider *provider,
+                         unsigned rank,
+                         unsigned nsplit)
+      : provider_(provider), fs_(NULL),
+        reader_(kBufferSize) {
+    this->Init(provider_->FileSize(), rank, nsplit);
  }
+  // destructor
+  virtual ~LineSplitter() {
+    if (fs_ != NULL) {
+      delete fs_; fs_ = NULL;
+    }
+    // delete provider after destructing the streams
+    delete provider_;
+  }
+  // get next line
  virtual bool NextLine(std::string *out_data) {
    if (file_ptr_ >= file_ptr_end_ &&
        offset_curr_ >= offset_end_) return false;
@@ -29,15 +60,15 @@ class LineSplitBase : public InputSplit {
      if (reader_.AtEnd()) {
        if (out_data->length() != 0) return true;
        file_ptr_ += 1;
+        if (offset_curr_ >= offset_end_) return false;
        if (offset_curr_ != file_offset_[file_ptr_]) {
-          utils::Error("warning:std::FILE size not calculated correctly\n");
+          utils::Error("warning: FILE size not calculated correctly\n");
          offset_curr_ = file_offset_[file_ptr_];
        }
-        if (offset_curr_ >= offset_end_) return false;
        utils::Assert(file_ptr_ + 1 < file_offset_.size(),
                      "boundary check");
        delete fs_;
-        fs_ = this->GetFile(file_ptr_);
+        fs_ = provider_->Open(file_ptr_);
        reader_.set_stream(fs_);
      } else {
        ++offset_curr_;
@@ -51,15 +82,27 @@ class LineSplitBase : public InputSplit {
      }
    }
  }
-
- protected:
-  // constructor
-  LineSplitBase(void)
-      : fs_(NULL), reader_(kBufferSize) {
+  /*!
+   * \brief split a uri string into individual file names
+   * \param out_fname output file names
+   * \param uri_ the input uri string
+   * \param dlm delimiter characters separating the names
+   */
+  inline static void SplitNames(std::vector<std::string> *out_fname,
+                                const char *uri_,
+                                const char *dlm) {
+    std::string uri = uri_;
+    char *p = std::strtok(BeginPtr(uri), dlm);
+    while (p != NULL) {
+      out_fname->push_back(std::string(p));
+      p = std::strtok(NULL, dlm);
+    }
  }
+  
+ private:
  /*!
   * \brief initialize the line spliter,
-   * \param file_size, size of each std::FILEs
+   * \param file_size, size of each file
   * \param rank the current rank of the data
   * \param nsplit number of split we will divide the data into
   */
@@ -82,7 +125,7 @@ class LineSplitBase : public InputSplit {
    file_ptr_end_ = std::upper_bound(file_offset_.begin(),
                                     file_offset_.end(),
                                     offset_end_) - file_offset_.begin() - 1;
-    fs_ = GetFile(file_ptr_);
+    fs_ = provider_->Open(file_ptr_);
    reader_.set_stream(fs_);
    // try to set the starting position correctly
    if (file_offset_[file_ptr_] != offset_begin_) {
@@ -94,33 +137,15 @@ class LineSplitBase : public InputSplit {
      }
    }
  }
-  /*!
-   * \brief get the seek stream of given file_index
-   * \return the corresponding seek stream at head of std::FILE
-   */
-  virtual utils::ISeekStream *GetFile(size_t file_index) = 0;
-  /*!
-   * \brief split names given 
-   * \param out_fname output std::FILE names
-   * \param uri_ the iput uri std::FILE
-   * \param dlm deliminetr
-   */
-  inline static void SplitNames(std::vector<std::string> *out_fname,
-                                const char *uri_,
-                                const char *dlm) {
-    std::string uri = uri_;
-    char *p = std::strtok(BeginPtr(uri), dlm);
-    while (p != NULL) {
-      out_fname->push_back(std::string(p));
-      p = std::strtok(NULL, dlm);
-    }
-  }
+
 private:
+  /*! \brief provider used to open the input files and report their sizes */
+  IFileProvider *provider_;
  /*! \brief current input stream */
  utils::ISeekStream *fs_;
-  /*! \brief std::FILE pointer of which std::FILE to read on */
+  /*! \brief index of the file currently being read */
  size_t file_ptr_;
-  /*! \brief std::FILE pointer where the end of std::FILE lies */
+  /*! \brief index of the file in which the end offset lies */
  size_t file_ptr_end_;
  /*! \brief get the current offset */
  size_t offset_curr_;
@@ -128,7 +153,7 @@ class LineSplitBase : public InputSplit {
  size_t offset_begin_;
  /*! \brief end of the offset */
  size_t offset_end_;
-  /*! \brief byte-offset of each std::FILE */
+  /*! \brief byte-offset of each file */
  std::vector<size_t> file_offset_;
  /*! \brief buffer reader */
  StreamBufferReader reader_;
--- a/subtree/rabit/rabit-learn/linear/Makefile
+++ b/subtree/rabit/rabit-learn/linear/Makefile
@@ -1,4 +1,10 @@
-# specify tensor path
+ifneq ("$(wildcard ../config.mk)","")
+	config = ../config.mk
+else
+	config = ../make/config.mk
+endif
+include $(config)
+
 BIN = linear.rabit
 MOCKBIN= linear.mock
 MPIBIN = 
@@ -6,10 +12,10 @@ MPIBIN =
 OBJ = linear.o

 # common build script for programs
-include ../make/config.mk
 include ../make/common.mk
 CFLAGS+=-fopenmp
 linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
 # dependenies here
 linear.rabit: linear.o lib
 linear.mock: linear.o lib
+
--- a/subtree/rabit/rabit-learn/linear/linear.cc
+++ b/subtree/rabit/rabit-learn/linear/linear.cc
@@ -206,21 +206,22 @@ int main(int argc, char *argv[]) {
    rabit::Finalize();
    return 0;
  }
-  rabit::linear::LinearObjFunction linear;
+  rabit::linear::LinearObjFunction *linear = new rabit::linear::LinearObjFunction();
  if (!strcmp(argv[1], "stdin")) {
-    linear.LoadData(argv[1]);
+    linear->LoadData(argv[1]);
    rabit::Init(argc, argv);
  } else {
    rabit::Init(argc, argv);
-    linear.LoadData(argv[1]);
+    linear->LoadData(argv[1]);
  }
  for (int i = 2; i < argc; ++i) {
    char name[256], val[256];
    if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
-      linear.SetParam(name, val);
+      linear->SetParam(name, val);
    }
  }
-  linear.Run();
+  linear->Run();
+  delete linear;
  rabit::Finalize();
  return 0;
 }
--- a/subtree/rabit/rabit-learn/linear/linear.h
+++ b/subtree/rabit/rabit-learn/linear/linear.h
@@ -26,10 +26,11 @@ struct LinearModel {
    int reserved[16];
    // constructor
    ModelParam(void) {
+      memset(this, 0, sizeof(ModelParam));
      base_score = 0.5f;
      num_feature = 0;
      loss_type = 1;
-      std::memset(reserved, 0, sizeof(reserved));
+      num_feature = 0;
    }
    // initialize base score
    inline void InitBaseScore(void) {
@@ -119,7 +120,7 @@ struct LinearModel {
    }
    fi.Read(weight, sizeof(float) * (param.num_feature + 1));
  }
-  inline void Save(rabit::IStream &fo, const float *wptr = NULL) const {
+  inline void Save(rabit::IStream &fo, const float *wptr = NULL) {
    fo.Write(&param, sizeof(param));
    if (wptr == NULL) wptr = weight;
    fo.Write(wptr, sizeof(float) * (param.num_feature + 1));
--- a/subtree/rabit/rabit-learn/linear/run-yarn.sh
+++ b/subtree/rabit/rabit-learn/linear/run-yarn.sh
@@ -6,12 +6,13 @@ then
 fi

 # put the local training file to HDFS
-hadoop fs -rm -r -f $2/data
 hadoop fs -rm -r -f $2/mushroom.linear.model
+
 hadoop fs -mkdir $2/data
+hadoop fs -put ../data/agaricus.txt.train $2/data

 # submit to hadoop
-../../tracker/rabit_yarn.py  -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}" 
+../../tracker/rabit_yarn.py  -n $1 --vcores 1  ./linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}" 

 # get the final model file
 hadoop fs -get $2/mushroom.linear.model ./linear.model
--- a/subtree/rabit/rabit-learn/make/config.mk
+++ b/subtree/rabit/rabit-learn/make/config.mk
@@ -6,7 +6,7 @@
 #
 #  - copy this file to the root of rabit-learn folder
 #  - modify the configuration you want
-#  - type make or make -j n for parallel build
+#  - type make or make -j n in each of the folders
 #----------------------------------------------------

 # choice of compiler
--- a/subtree/rabit/rabit-learn/solver/lbfgs.h
+++ b/subtree/rabit/rabit-learn/solver/lbfgs.h
@@ -145,8 +145,9 @@ class LBFGSSolver {
      
      if (silent == 0 && rabit::GetRank() == 0) {
        rabit::TrackerPrintf
-            ("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu\n",
-             gstate.num_dim, gstate.init_objval, gstate.size_memory);
+            ("L-BFGS solver starts, num_dim=%lu, init_objval=%g, size_memory=%lu, RAM-approx=%lu\n",
+             gstate.num_dim, gstate.init_objval, gstate.size_memory,
+             gstate.MemCost() + hist.MemCost());
      }
    }
  }
@@ -176,7 +177,7 @@ class LBFGSSolver {
    // swap new weight 
    std::swap(g.weight, g.grad);
    // check stop condition
-    if (gstate.num_iteration > min_lbfgs_iter) {
+    if (gstate.num_iteration > static_cast<size_t>(min_lbfgs_iter)) {
      if (g.old_objval - g.new_objval < lbfgs_stop_tol * g.init_objval) {
        return true;
      }
@@ -195,7 +196,7 @@ class LBFGSSolver {
  /*! \brief run optimization */
  virtual void Run(void) {
    this->Init();
-    while (gstate.num_iteration < max_lbfgs_iter) {
+    while (gstate.num_iteration < static_cast<size_t>(max_lbfgs_iter)) {
      if (this->UpdateOneIter()) break;
    }
    if (silent == 0 && rabit::GetRank() == 0) {
@@ -225,7 +226,7 @@ class LBFGSSolver {
    const size_t num_dim = gstate.num_dim;
    const DType *gsub = grad + range_begin_;
    const size_t nsub = range_end_ - range_begin_;
-    double vdot;
+    double vdot = 0.0;
    if (n != 0) {
      // hist[m + n - 1] stores old gradient
      Minus(hist[m + n - 1], gsub, hist[m + n - 1], nsub);
@@ -241,15 +242,19 @@ class LBFGSSolver {
        idxset.push_back(std::make_pair(m + j, 2 * m));
        idxset.push_back(std::make_pair(m + j, m + n - 1));
      }
+
      // calculate dot products
      std::vector<double> tmp(idxset.size());
      for (size_t i = 0; i < tmp.size(); ++i) {
        tmp[i] = hist.CalcDot(idxset[i].first, idxset[i].second);
      }
+
      rabit::Allreduce<rabit::op::Sum>(BeginPtr(tmp), tmp.size());
+
      for (size_t i = 0; i < tmp.size(); ++i) {
        gstate.DotBuf(idxset[i].first, idxset[i].second) = tmp[i];
      }
+
      // BFGS steps, use vector-free update
      // parameterize vector using basis in hist
      std::vector<double> alpha(n);
@@ -263,7 +268,7 @@ class LBFGSSolver {
        }
        alpha[j] = vsum / gstate.DotBuf(j, m + j);
        delta[m + j] = delta[m + j] - alpha[j];
-      }
+      }      
      // scale
      double scale = gstate.DotBuf(n - 1, m + n - 1) /
      gstate.DotBuf(m + n - 1, m + n - 1);
@@ -279,6 +284,7 @@ class LBFGSSolver {
        double beta = vsum / gstate.DotBuf(j, m + j);
        delta[j] = delta[j] + (alpha[j] - beta);
      }
+
      // set all to zero
      std::fill(dir, dir + num_dim, 0.0f);
      DType *dirsub = dir + range_begin_; 
@@ -291,10 +297,11 @@ class LBFGSSolver {
      }
      FixDirL1Sign(dirsub, hist[2 * m], nsub);
      vdot = -Dot(dirsub, hist[2 * m], nsub);
+
      // allreduce to get full direction
      rabit::Allreduce<rabit::op::Sum>(dir, num_dim);
      rabit::Allreduce<rabit::op::Sum>(&vdot, 1);
-    } else {     
+    } else {
      SetL1Dir(dir, grad, weight, num_dim);
      vdot = -Dot(dir, dir, num_dim);
    }
@@ -482,6 +489,7 @@ class LBFGSSolver {
      num_iteration = 0;
      num_dim = 0;
      old_objval = 0.0;
+      offset_ = 0;
    }
    ~GlobalState(void) {
      if (grad != NULL) {
@@ -496,6 +504,10 @@ class LBFGSSolver {
      data.resize(n * n, 0.0);
      this->AllocSpace();
    }
+    // memory cost
+    inline size_t MemCost(void) const {
+      return sizeof(DType) * 3 * num_dim;
+    }
    inline double &DotBuf(size_t i, size_t j)  {
      if (i > j) std::swap(i, j);
      return data[MapIndex(i, offset_, size_memory) * (size_memory * 2 + 1) +
@@ -565,6 +577,10 @@ class LBFGSSolver {
      size_t n = size_memory * 2 + 1;
      dptr_ = new DType[n * stride_];
    }
+    // memory cost
+    inline size_t MemCost(void) const {
+      return sizeof(DType) * (size_memory_ * 2 + 1) * stride_;
+    }
    // fetch element from rolling array
    inline const DType *operator[](size_t i) const {
      return dptr_ + MapIndex(i, offset_, size_memory_) * stride_;
--- a/subtree/rabit/rabit-learn/utils/data.h
+++ b/subtree/rabit/rabit-learn/utils/data.h
@@ -77,11 +77,15 @@ struct SparseMat {
    feat_dim += 1;
    utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
                 "feature dimension exceed limit of index_t"\
-                 "consider change the index_t to unsigned long");
+                 "consider change the index_t to unsigned long");    
  }
  inline size_t NumRow(void) const {
    return row_ptr.size() - 1;
  }
+  // memory cost
+  inline size_t MemCost(void) const {
+    return data.size() * sizeof(Entry);
+  }
  // maximum feature dimension
  size_t feat_dim;
  std::vector<size_t> row_ptr;