Merge commit '57b5d7873f4f0953357e9d98e9c60cff8373d7ec'

tqchen
2015-03-09 13:28:38 -07:00
43 changed files with 1797 additions and 235 deletions

View File

@@ -5,15 +5,13 @@ It also contains links to the Machine Learning packages that use rabit.
* Contributions of toolkits, examples, and benchmarks are more than welcome!
Toolkits
====
* [KMeans Clustering](kmeans)
* [Linear and Logistic Regression](linear)
* [Linear and Logistic Regression](linear)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
- xgboost is a very fast boosted tree (also known as GBDT) library that can run more than
10 times faster than existing packages
- Rabit carries xgboost to a distributed environment, inheriting all the benefits of the xgboost
single-node version, and scales it to even larger problems

View File

@@ -1,5 +1,5 @@
#ifndef RABIT_LEARN_UTILS_BASE64_H_
#define RABIT_LEARN_UTILS_BASE64_H_
#ifndef RABIT_LEARN_IO_BASE64_INL_H_
#define RABIT_LEARN_IO_BASE64_INL_H_
/*!
* \file base64.h
* \brief data stream support to input and output from/to base64 stream
@@ -8,10 +8,11 @@
*/
#include <cctype>
#include <cstdio>
#include <rabit/io.h>
#include "./io.h"
#include "./buffer_reader-inl.h"
namespace rabit {
namespace utils {
namespace io {
/*! \brief namespace of base64 decoding and encoding table */
namespace base64 {
const char DecodeTable[] = {
@@ -34,7 +35,8 @@ static const char EncodeTable[] =
/*! \brief the stream that reads from base64, note we take an input stream */
class Base64InStream: public IStream {
public:
explicit Base64InStream(FILE *fp) : fp(fp) {
explicit Base64InStream(IStream *fs) : reader_(256) {
reader_.set_stream(fs);
num_prev = 0; tmp_ch = 0;
}
/*!
@@ -44,7 +46,7 @@ class Base64InStream: public IStream {
inline void InitPosition(void) {
// get a character
do {
tmp_ch = fgetc(fp);
tmp_ch = reader_.GetChar();
} while (isspace(tmp_ch));
}
/*! \brief whether current position is end of a base64 stream */
@@ -85,19 +87,19 @@ class Base64InStream: public IStream {
nvalue = DecodeTable[tmp_ch] << 18;
{
// second byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
nvalue |= DecodeTable[tmp_ch] << 12;
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
}
{
// third byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
// handle termination
if (tmp_ch == '=') {
Check((tmp_ch = fgetc(fp), tmp_ch == '='), "invalid base64 format");
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
@@ -110,10 +112,10 @@ class Base64InStream: public IStream {
}
{
// fourth byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
if (tmp_ch == '=') {
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
@@ -125,10 +127,10 @@ class Base64InStream: public IStream {
}
}
// get next char
tmp_ch = fgetc(fp);
tmp_ch = reader_.GetChar();
}
if (kStrictCheck) {
Check(tlen == 0, "Base64InStream: read incomplete");
utils::Check(tlen == 0, "Base64InStream: read incomplete");
}
return size - tlen;
}
@@ -137,7 +139,7 @@ class Base64InStream: public IStream {
}
private:
FILE *fp;
StreamBufferReader reader_;
int tmp_ch;
int num_prev;
unsigned char buf_prev[2];
@@ -147,7 +149,7 @@ class Base64InStream: public IStream {
/*! \brief the stream that writes to base64, note we take an output stream */
class Base64OutStream: public IStream {
public:
explicit Base64OutStream(FILE *fp) : fp(fp) {
explicit Base64OutStream(IStream *fp) : fp(fp) {
buf_top = 0;
}
virtual void Write(const void *ptr, size_t size) {
@@ -160,16 +162,16 @@ class Base64OutStream: public IStream {
}
if (buf_top == 3) {
// flush 4 bytes out
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
fputc(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F], fp);
fputc(EncodeTable[buf[3] & 0x3F], fp);
PutChar(EncodeTable[buf[1] >> 2]);
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
PutChar(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F]);
PutChar(EncodeTable[buf[3] & 0x3F]);
buf_top = 0;
}
}
}
virtual size_t Read(void *ptr, size_t size) {
Error("Base64OutStream do not support read");
utils::Error("Base64OutStream do not support read");
return 0;
}
/*!
@@ -179,26 +181,38 @@ class Base64OutStream: public IStream {
inline void Finish(char endch = EOF) {
using base64::EncodeTable;
if (buf_top == 1) {
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[(buf[1] << 4) & 0x3F], fp);
fputc('=', fp);
fputc('=', fp);
PutChar(EncodeTable[buf[1] >> 2]);
PutChar(EncodeTable[(buf[1] << 4) & 0x3F]);
PutChar('=');
PutChar('=');
}
if (buf_top == 2) {
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
fputc(EncodeTable[(buf[2] << 2) & 0x3F], fp);
fputc('=', fp);
PutChar(EncodeTable[buf[1] >> 2]);
PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
PutChar(EncodeTable[(buf[2] << 2) & 0x3F]);
PutChar('=');
}
buf_top = 0;
if (endch != EOF) fputc(endch, fp);
if (endch != EOF) PutChar(endch);
this->Flush();
}
private:
FILE *fp;
private:
IStream *fp;
int buf_top;
unsigned char buf[4];
std::string out_buf;
const static size_t kBufferSize = 256;
inline void PutChar(char ch) {
out_buf += ch;
if (out_buf.length() >= kBufferSize) Flush();
}
inline void Flush(void) {
fp->Write(BeginPtr(out_buf), out_buf.length());
out_buf.clear();
}
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_UTILS_BASE64_H_
#endif // RABIT_LEARN_IO_BASE64_INL_H_
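A minimal usage sketch of the two base64 streams above, round-tripping a few bytes through a hypothetical file model.b64 via FileStream from file-inl.h in this same commit (assuming the sketch lives next to io.h):

#include "./io.h"

int main(void) {
  using namespace rabit;
  {
    io::FileStream fo("model.b64", "w");   // hypothetical output file
    io::Base64OutStream bout(&fo);
    int value = 42;
    bout.Write(&value, sizeof(value));     // bytes are buffered and base64-encoded
    bout.Finish('\n');                     // emit padding and flush to the underlying stream
  }
  {
    io::FileStream fi("model.b64", "r");
    io::Base64InStream bin(&fi);
    bin.InitPosition();                    // skip leading whitespace before decoding
    int value = 0;
    bin.Read(&value, sizeof(value));       // value is 42 again
  }
  return 0;
}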

View File

@@ -0,0 +1,57 @@
#ifndef RABIT_LEARN_IO_BUFFER_READER_INL_H_
#define RABIT_LEARN_IO_BUFFER_READER_INL_H_
/*!
* \file buffer_reader-inl.h
* \brief implementation of stream buffer reader
* \author Tianqi Chen
*/
#include "./io.h"
namespace rabit {
namespace io {
/*! \brief buffer reader of the stream that allows you to get characters from it one at a time */
class StreamBufferReader {
public:
StreamBufferReader(size_t buffer_size)
:stream_(NULL),
read_len_(1), read_ptr_(1) {
buffer_.resize(buffer_size);
}
/*!
* \brief set input stream
*/
inline void set_stream(IStream *stream) {
stream_ = stream;
read_len_ = read_ptr_ = 1;
}
/*!
* \brief allows quick read using get char
*/
inline char GetChar(void) {
while (true) {
if (read_ptr_ < read_len_) {
return buffer_[read_ptr_++];
} else {
read_len_ = stream_->Read(&buffer_[0], buffer_.length());
if (read_len_ == 0) return EOF;
read_ptr_ = 0;
}
}
}
inline bool AtEnd(void) const {
return read_len_ == 0;
}
private:
/*! \brief the underlying stream */
IStream *stream_;
/*! \brief buffer to hold data */
std::string buffer_;
/*! \brief length of valid data in buffer */
size_t read_len_;
/*! \brief pointer in the buffer */
size_t read_ptr_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_BUFFER_READER_INL_H_
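A short sketch of how StreamBufferReader is meant to be driven, assuming a FileStream (from file-inl.h in this commit) over a hypothetical input.txt; AtEnd() is checked after GetChar(), the same way line_split-inl.h uses it:

rabit::io::FileStream fs("input.txt", "r");
rabit::io::StreamBufferReader reader(256);   // 256-byte internal buffer
reader.set_stream(&fs);
std::string content;
while (true) {
  char c = reader.GetChar();
  if (reader.AtEnd()) break;                 // underlying Read returned 0, no more data
  content += c;
}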

View File

@@ -0,0 +1,106 @@
#ifndef RABIT_LEARN_IO_FILE_INL_H_
#define RABIT_LEARN_IO_FILE_INL_H_
/*!
* \file file-inl.h
* \brief normal filesystem I/O
* \author Tianqi Chen
*/
#include <string>
#include <vector>
#include <cstdio>
#include "./io.h"
#include "./line_split-inl.h"
/*! \brief io interface */
namespace rabit {
namespace io {
/*! \brief implementation of file i/o stream */
class FileStream : public utils::ISeekStream {
public:
explicit FileStream(const char *fname, const char *mode)
: use_stdio(false) {
#ifndef RABIT_STRICT_CXX98_
if (!strcmp(fname, "stdin")) {
use_stdio = true; fp = stdin;
}
if (!strcmp(fname, "stdout")) {
use_stdio = true; fp = stdout;
}
#endif
if (!strncmp(fname, "file://", 7)) fname += 7;
if (!use_stdio) {
std::string flag = mode;
if (flag == "w") flag = "wb";
if (flag == "r") flag = "rb";
fp = utils::FopenCheck(fname, flag.c_str());
}
}
virtual ~FileStream(void) {
this->Close();
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, 1, size, fp);
}
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
virtual bool AtEnd(void) const {
return feof(fp) != 0;
}
inline void Close(void) {
if (fp != NULL && !use_stdio) {
std::fclose(fp); fp = NULL;
}
}
private:
FILE *fp;
bool use_stdio;
};
/*! \brief line split from normal file system */
class FileSplit : public LineSplitBase {
public:
explicit FileSplit(const char *uri, unsigned rank, unsigned nsplit) {
LineSplitBase::SplitNames(&fnames_, uri, "#");
std::vector<size_t> fsize;
for (size_t i = 0; i < fnames_.size(); ++i) {
if (!strncmp(fnames_[i].c_str(), "file://", 7)) {
std::string tmp = fnames_[i].c_str() + 7;
fnames_[i] = tmp;
}
fsize.push_back(GetFileSize(fnames_[i].c_str()));
}
LineSplitBase::Init(fsize, rank, nsplit);
}
virtual ~FileSplit(void) {}
protected:
virtual utils::ISeekStream *GetFile(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new FileStream(fnames_[file_index].c_str(), "rb");
}
// get file size
inline static size_t GetFileSize(const char *fname) {
FILE *fp = utils::FopenCheck(fname, "rb");
// NOTE: fseek may not be good, but serves as ok solution
fseek(fp, 0, SEEK_END);
size_t fsize = static_cast<size_t>(ftell(fp));
fclose(fp);
return fsize;
}
private:
// file names
std::vector<std::string> fnames_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_FILE_INL_H_
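A minimal sketch of sharded line reading with FileSplit, using the '#'-joined multi-file URI convention handled by SplitNames; the file names, rank and split count are made up:

unsigned rank = 0, nsplit = 4;   // this worker's id and the total number of workers
rabit::io::FileSplit split("file://part-0.txt#file://part-1.txt", rank, nsplit);
std::string line;
while (split.NextLine(&line)) {
  // process one line that falls inside this worker's byte range
}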

View File

@@ -0,0 +1,140 @@
#ifndef RABIT_LEARN_IO_HDFS_INL_H_
#define RABIT_LEARN_IO_HDFS_INL_H_
/*!
* \file hdfs-inl.h
* \brief HDFS I/O
* \author Tianqi Chen
*/
#include <string>
#include <vector>
#include <hdfs.h>
#include <errno.h>
#include "./io.h"
#include "./line_split-inl.h"
/*! \brief io interface */
namespace rabit {
namespace io {
class HDFSStream : public utils::ISeekStream {
public:
HDFSStream(hdfsFS fs, const char *fname, const char *mode)
: fs_(fs), at_end_(false) {
int flag;
if (!strcmp(mode, "r")) {
flag = O_RDONLY;
} else if (!strcmp(mode, "w")) {
flag = O_WRONLY;
} else if (!strcmp(mode, "a")) {
flag = O_WRONLY | O_APPEND;
} else {
utils::Error("HDFSStream: unknown flag %s", mode);
}
fp_ = hdfsOpenFile(fs_, fname, flag, 0, 0, 0);
utils::Check(fp_ != NULL,
"HDFSStream: fail to open %s", fname);
}
virtual ~HDFSStream(void) {
this->Close();
}
virtual size_t Read(void *ptr, size_t size) {
tSize nread = hdfsRead(fs_, fp_, ptr, size);
if (nread == -1) {
int errsv = errno;
utils::Error("HDFSStream.Read Error:%s", strerror(errsv));
}
if (nread == 0) {
at_end_ = true;
}
return static_cast<size_t>(nread);
}
virtual void Write(const void *ptr, size_t size) {
const char *buf = reinterpret_cast<const char*>(ptr);
while (size != 0) {
tSize nwrite = hdfsWrite(fs_, fp_, buf, size);
if (nwrite == -1) {
int errsv = errno;
utils::Error("HDFSStream.Write Error:%s", strerror(errsv));
}
size_t sz = static_cast<size_t>(nwrite);
buf += sz; size -= sz;
}
}
virtual void Seek(size_t pos) {
if (hdfsSeek(fs_, fp_, pos) != 0) {
int errsv = errno;
utils::Error("HDFSStream.Seek Error:%s", strerror(errsv));
}
}
virtual size_t Tell(void) {
tOffset offset = hdfsTell(fs_, fp_);
if (offset == -1) {
int errsv = errno;
utils::Error("HDFSStream.Tell Error:%s", strerror(errsv));
}
return static_cast<size_t>(offset);
}
virtual bool AtEnd(void) const {
return at_end_;
}
inline void Close(void) {
if (fp_ != NULL) {
if (hdfsCloseFile(fs_, fp_) == -1) {
int errsv = errno;
utils::Error("HDFSStream.Close Error:%s", strerror(errsv));
}
fp_ = NULL;
}
}
private:
hdfsFS fs_;
hdfsFile fp_;
bool at_end_;
};
/*! \brief line split from HDFS file system */
class HDFSSplit : public LineSplitBase {
public:
explicit HDFSSplit(const char *uri, unsigned rank, unsigned nsplit) {
fs_ = hdfsConnect("default", 0);
std::vector<std::string> paths;
LineSplitBase::SplitNames(&paths, uri, "#");
// get the files
std::vector<size_t> fsize;
for (size_t i = 0; i < paths.size(); ++i) {
hdfsFileInfo *info = hdfsGetPathInfo(fs_, paths[i].c_str());
if (info->mKind == 'D') {
int nentry;
hdfsFileInfo *files = hdfsListDirectory(fs_, info->mName, &nentry);
for (int i = 0; i < nentry; ++i) {
if (files[i].mKind == 'F') {
fsize.push_back(files[i].mSize);
fnames_.push_back(std::string(files[i].mName));
}
}
hdfsFreeFileInfo(files, nentry);
} else {
fsize.push_back(info->mSize);
fnames_.push_back(std::string(info->mName));
}
hdfsFreeFileInfo(info, 1);
}
LineSplitBase::Init(fsize, rank, nsplit);
}
virtual ~HDFSSplit(void) {}
protected:
virtual utils::ISeekStream *GetFile(size_t file_index) {
utils::Assert(file_index < fnames_.size(), "file index exceed bound");
return new HDFSStream(fs_, fnames_[file_index].c_str(), "r");
}
private:
// hdfs handle
hdfsFS fs_;
// file names
std::vector<std::string> fnames_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_HDFS_INL_H_

View File

@@ -0,0 +1,65 @@
#ifndef RABIT_LEARN_IO_IO_INL_H_
#define RABIT_LEARN_IO_IO_INL_H_
/*!
* \file io-inl.h
* \brief Input/Output utils that handle read/write
*  of files in a distributed environment
* \author Tianqi Chen
*/
#include <cstring>
#include "./io.h"
#if RABIT_USE_HDFS
#include "./hdfs-inl.h"
#endif
#include "./file-inl.h"
namespace rabit {
namespace io {
/*!
* \brief create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part the part id of current input
* \param nsplit total number of splits
*/
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit) {
if (!strcmp(uri, "stdin")) {
return new SingleFileSplit(uri);
}
if (!strncmp(uri, "file://", 7)) {
return new FileSplit(uri, part, nsplit);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSSplit(uri, part, nsplit);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new FileSplit(uri, part, nsplit);
}
/*!
* \brief create a stream; the stream must be able to close
*  the underlying resources (files) when deleted
*
* \param uri the uri of the input, can contain hdfs prefix
* \param mode can be 'w' or 'r' for read or write
*/
inline IStream *CreateStream(const char *uri, const char *mode) {
if (!strncmp(uri, "file://", 7)) {
return new FileStream(uri + 7, mode);
}
if (!strncmp(uri, "hdfs://", 7)) {
#if RABIT_USE_HDFS
return new HDFSStream(hdfsConnect("default", 0), uri, mode);
#else
utils::Error("Please compile with RABIT_USE_HDFS=1");
#endif
}
return new FileStream(uri, mode);
}
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_IO_INL_H_
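A short sketch of the CreateStream factory above: the URI prefix picks FileStream or HDFSStream, and deleting the stream closes the underlying resource, as the interface promises. The file name is hypothetical:

rabit::IStream *fo = rabit::io::CreateStream("file://tmp.bin", "w");
const char msg[] = "hello";
fo->Write(msg, sizeof(msg));
delete fo;                                   // closes the file

rabit::IStream *fi = rabit::io::CreateStream("file://tmp.bin", "r");
char buf[sizeof(msg)] = {0};
fi->Read(buf, sizeof(buf));                  // buf now holds "hello"
delete fi;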

View File

@@ -0,0 +1,61 @@
#ifndef RABIT_LEARN_IO_IO_H_
#define RABIT_LEARN_IO_IO_H_
/*!
* \file io.h
* \brief Input/Output utils that handle read/write
*  of files in a distributed environment
* \author Tianqi Chen
*/
#include "../../include/rabit_serializable.h"
/*! \brief whether compile with HDFS support */
#ifndef RABIT_USE_HDFS
#define RABIT_USE_HDFS 0
#endif
/*! \brief io interface */
namespace rabit {
/*!
* \brief namespace to handle input split and filesystem interfacing
*/
namespace io {
typedef utils::ISeekStream ISeekStream;
/*!
* \brief user-facing input split helper,
*  can be used to get the partition of the data used by the current node
*/
class InputSplit {
public:
/*!
* \brief get next line, store into out_data
* \param out_data the string that stores the line data,
* \n is not included
* \return true if the next line was found, false if we have read all the lines
*/
virtual bool NextLine(std::string *out_data) = 0;
/*! \brief destructor*/
virtual ~InputSplit(void) {}
};
/*!
* \brief create input split given a uri
* \param uri the uri of the input, can contain hdfs prefix
* \param part the part id of current input
* \param nsplit total number of splits
*/
inline InputSplit *CreateInputSplit(const char *uri,
unsigned part,
unsigned nsplit);
/*!
* \brief create a stream; the stream must be able to close
*  the underlying resources (files) when deleted
*
* \param uri the uri of the input, can contain hdfs prefix
* \param mode can be 'w' or 'r' for read or write
*/
inline IStream *CreateStream(const char *uri, const char *mode);
} // namespace io
} // namespace rabit
#include "./io-inl.h"
#include "./base64-inl.h"
#endif // RABIT_LEARN_IO_IO_H_
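A sketch of the intended calling pattern for the interface declared above, inside an already initialized rabit program; each worker reads only its own part of the input (the data path mirrors the demo scripts):

rabit::io::InputSplit *in =
    rabit::io::CreateInputSplit("../data/agaricus.txt.train",
                                rabit::GetRank(), rabit::GetWorldSize());
std::string line;
while (in->NextLine(&line)) {
  // parse one training instance from line
}
delete in;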

View File

@@ -0,0 +1,181 @@
#ifndef RABIT_LEARN_IO_LINE_SPLIT_INL_H_
#define RABIT_LEARN_IO_LINE_SPLIT_INL_H_
/*!
* \file line_split-inl.h
* \brief base implementation of line splitter
* \author Tianqi Chen
*/
#include <vector>
#include <utility>
#include <cstring>
#include <string>
#include "../../include/rabit.h"
#include "./io.h"
#include "./buffer_reader-inl.h"
namespace rabit {
namespace io {
class LineSplitBase : public InputSplit {
public:
virtual ~LineSplitBase() {
if (fs_ != NULL) delete fs_;
}
virtual bool NextLine(std::string *out_data) {
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
out_data->clear();
while (true) {
char c = reader_.GetChar();
if (reader_.AtEnd()) {
if (out_data->length() != 0) return true;
file_ptr_ += 1;
if (offset_curr_ != file_offset_[file_ptr_]) {
utils::Error("warning:file size not calculated correctly\n");
offset_curr_ = file_offset_[file_ptr_];
}
if (offset_curr_ >= offset_end_) return false;
utils::Assert(file_ptr_ + 1 < file_offset_.size(),
"boundary check");
delete fs_;
fs_ = this->GetFile(file_ptr_);
reader_.set_stream(fs_);
} else {
++offset_curr_;
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (file_ptr_ >= file_ptr_end_ &&
offset_curr_ >= offset_end_) return false;
}
}
}
}
protected:
// constructor
LineSplitBase(void)
: fs_(NULL), reader_(kBufferSize) {
}
/*!
* \brief initialize the line splitter
* \param file_size size of each file
* \param rank the current rank of the data
* \param nsplit number of splits we will divide the data into
*/
inline void Init(const std::vector<size_t> &file_size,
unsigned rank, unsigned nsplit) {
file_offset_.resize(file_size.size() + 1);
file_offset_[0] = 0;
for (size_t i = 0; i < file_size.size(); ++i) {
file_offset_[i + 1] = file_offset_[i] + file_size[i];
}
size_t ntotal = file_offset_.back();
size_t nstep = (ntotal + nsplit - 1) / nsplit;
offset_begin_ = std::min(nstep * rank, ntotal);
offset_end_ = std::min(nstep * (rank + 1), ntotal);
offset_curr_ = offset_begin_;
if (offset_begin_ == offset_end_) return;
file_ptr_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_begin_) - file_offset_.begin() - 1;
file_ptr_end_ = std::upper_bound(file_offset_.begin(),
file_offset_.end(),
offset_end_) - file_offset_.begin() - 1;
fs_ = GetFile(file_ptr_);
reader_.set_stream(fs_);
// try to set the starting position correctly
if (file_offset_[file_ptr_] != offset_begin_) {
fs_->Seek(offset_begin_ - file_offset_[file_ptr_]);
while (true) {
char c = reader_.GetChar();
if (!reader_.AtEnd()) ++offset_curr_;
if (c == '\n' || c == '\r' || c == EOF) return;
}
}
}
/*!
* \brief get the seek stream of given file_index
* \return the corresponding seek stream at head of file
*/
virtual utils::ISeekStream *GetFile(size_t file_index) = 0;
/*!
* \brief split names given
* \param out_fname output file names
* \param uri_ the input uri
* \param dlm the delimiter
*/
inline static void SplitNames(std::vector<std::string> *out_fname,
const char *uri_,
const char *dlm) {
std::string uri = uri_;
char *p = strtok(BeginPtr(uri), dlm);
while (p != NULL) {
out_fname->push_back(std::string(p));
p = strtok(NULL, dlm);
}
}
private:
/*! \brief current input stream */
utils::ISeekStream *fs_;
/*! \brief file pointer of which file to read on */
size_t file_ptr_;
/*! \brief file pointer where the end of file lies */
size_t file_ptr_end_;
/*! \brief get the current offset */
size_t offset_curr_;
/*! \brief beginning of offset */
size_t offset_begin_;
/*! \brief end of the offset */
size_t offset_end_;
/*! \brief byte-offset of each file */
std::vector<size_t> file_offset_;
/*! \brief buffer reader */
StreamBufferReader reader_;
/*! \brief buffer size */
const static size_t kBufferSize = 256;
};
/*! \brief line split from single file */
class SingleFileSplit : public InputSplit {
public:
explicit SingleFileSplit(const char *fname) : use_stdin_(false) {
if (!strcmp(fname, "stdin")) {
#ifndef RABIT_STRICT_CXX98_
use_stdin_ = true; fp_ = stdin;
#endif
}
if (!use_stdin_) {
fp_ = utils::FopenCheck(fname, "r");
}
end_of_file_ = false;
}
virtual ~SingleFileSplit(void) {
if (!use_stdin_) fclose(fp_);
}
virtual bool NextLine(std::string *out_data) {
if (end_of_file_) return false;
out_data->clear();
while (true) {
char c = fgetc(fp_);
if (c == EOF) {
end_of_file_ = true;
}
if (c != '\r' && c != '\n' && c != EOF) {
*out_data += c;
} else {
if (out_data->length() != 0) return true;
if (end_of_file_) return false;
}
}
return false;
}
private:
FILE *fp_;
bool use_stdin_;
bool end_of_file_;
};
} // namespace io
} // namespace rabit
#endif // RABIT_LEARN_IO_LINE_SPLIT_INL_H_
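A worked sketch of the offset arithmetic inside LineSplitBase::Init, with made-up sizes: three files of 100, 50 and 250 bytes split across nsplit = 4 workers, viewed from rank 2:

#include <algorithm>
#include <cstddef>

inline void SplitArithmeticExample(void) {
  size_t file_offset[] = {0, 100, 150, 400};       // cumulative file sizes, ntotal = 400
  unsigned nsplit = 4, rank = 2;
  size_t ntotal = file_offset[3];
  size_t nstep = (ntotal + nsplit - 1) / nsplit;   // = 100 bytes per worker
  size_t offset_begin = std::min<size_t>(nstep * rank, ntotal);       // = 200
  size_t offset_end = std::min<size_t>(nstep * (rank + 1), ntotal);   // = 300
  // upper_bound places both 200 and 300 inside the third file (global offsets [150, 400)),
  // so rank 2 seeks to local offset 50 in that file, discards the partial line up to the
  // next '\n', and then reads lines until it has covered its 100-byte share.
  (void)offset_begin; (void)offset_end;
}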

View File

@@ -6,11 +6,10 @@ MPIBIN = kmeans.mpi
OBJ = kmeans.o
# common build script for programs
include ../common.mk
include ../make/common.mk
# dependencies here
kmeans.rabit: kmeans.o lib
kmeans.mock: kmeans.o lib
kmeans.mpi: kmeans.o libmpi
kmeans.o: kmeans.cc ../../src/*.h

View File

@@ -0,0 +1,2 @@
mushroom.row*
*.model

View File

@@ -6,7 +6,8 @@ MPIBIN =
OBJ = linear.o
# common build script for programs
include ../common.mk
include ../make/config.mk
include ../make/common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependencies here

View File

@@ -2,11 +2,24 @@ Linear and Logistic Regression
====
* input format: LibSVM
* Local Example: [run-linear.sh](run-linear.sh)
* Running on Hadoop: [run-hadoop.sh](run-hadoop.sh)
- Set input data to stdin, and model_out=stdout
* Running on YARN: [run-yarn.sh](run-yarn.sh)
- You will need to have YARN
- Modify ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
- Run build.sh in [../../yarn](../../yarn) to build the yarn jar file
Multi-Threading Optimization
====
* The code can be multi-threaded; we encourage you to use it
- Simply add ```nthread=k``` where k is the number of threads you want to use
* If you submit with YARN
- Use ```--vcores``` and ```-mem``` to request CPU and memory resources
- Some schedulers in YARN do not honor CPU requests; you can request more memory to grab working slots
* Multi-threading usually improves speed
- You can use fewer workers and assign more resources to each worker
- This usually means less communication overhead and faster running time
Parameters
===
====
All the parameters can be set by param=value
#### Important Parameters

View File

@@ -1,6 +1,5 @@
#include "./linear.h"
#include "../utils/io.h"
#include "../utils/base64.h"
#include "../io/io.h"
namespace rabit {
namespace linear {
@@ -55,7 +54,9 @@ class LinearObjFunction : public solver::IObjFunction<float> {
}
if (task == "train") {
lbfgs.Run();
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
if (rabit::GetRank() == 0) {
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
}
} else if (task == "pred") {
this->TaskPred();
} else {
@@ -74,51 +75,37 @@ class LinearObjFunction : public solver::IObjFunction<float> {
printf("Finishing writing to %s\n", name_pred.c_str());
}
inline void LoadModel(const char *fname) {
FILE *fp = utils::FopenCheck(fname, "rb");
IStream *fi = io::CreateStream(fname, "r");
std::string header; header.resize(4);
// check header for different binary encode
// can be base64 or binary
utils::FileStream fi(fp);
utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
// base64 format
utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
// base64 format
if (header == "bs64") {
utils::Base64InStream bsin(fp);
io::Base64InStream bsin(fi);
bsin.InitPosition();
model.Load(bsin);
fclose(fp);
return;
} else if (header == "binf") {
model.Load(fi);
fclose(fp);
return;
model.Load(*fi);
} else {
utils::Error("invalid model file");
}
delete fi;
}
inline void SaveModel(const char *fname,
const float *wptr,
bool save_base64 = false) {
FILE *fp;
bool use_stdout = false;
if (!strcmp(fname, "stdout")) {
fp = stdout;
use_stdout = true;
} else {
fp = utils::FopenCheck(fname, "wb");
}
utils::FileStream fo(fp);
if (save_base64 != 0|| use_stdout) {
fo.Write("bs64\t", 5);
utils::Base64OutStream bout(fp);
IStream *fo = io::CreateStream(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5);
io::Base64OutStream bout(fo);
model.Save(bout, wptr);
bout.Finish('\n');
} else {
fo.Write("binf", 4);
model.Save(fo, wptr);
}
if (!use_stdout) {
fclose(fp);
fo->Write("binf", 4);
model.Save(*fo, wptr);
}
delete fo;
}
inline void LoadData(const char *fname) {
dtrain.Load(fname);

View File

@@ -12,7 +12,7 @@ hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../tracker/rabit_hadoop.py --host_ip ip -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model

View File

@@ -5,11 +5,7 @@ then
exit -1
fi
rm -rf mushroom.row* *.model
rm -rf *.model
k=$1
# split the lib svm file into k subfiles
python splitrows.py ../data/agaricus.txt.train mushroom $k
# run the linear model in mock test mode
../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1

View File

@@ -5,13 +5,10 @@ then
exit -1
fi
rm -rf mushroom.row* *.model
rm -rf *.model
k=$1
# split the lib svm file into k subfiles
python splitrows.py ../data/agaricus.txt.train mushroom $k
# run xgboost mpi
../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
# run linear model, the program will automatically split the inputs
../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model

View File

@@ -0,0 +1,19 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
exit -1
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
# submit to hadoop
../../tracker/rabit_yarn.py -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model ./linear.model
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model

View File

@@ -1,24 +0,0 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different rows
if len(sys.argv) < 4:
print ('Usage:<fin> <fo> k')
exit(0)
random.seed(10)
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
for l in open(sys.argv[1]):
i = random.randint(0, k-1)
fos[i].write(l)
for f in fos:
f.close()

View File

@@ -1,13 +1,20 @@
# this is the common build script for rabit programs
# you do not have to use it
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm -L../../lib
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
# you do not have to use it
export LDFLAGS= -L../../lib -pthread -lm -lrt
export CFLAGS = -Wall -msse2 -Wno-unknown-pragmas -fPIC -I../../include
# setup HDFS
ifeq ($(USE_HDFS),1)
CFLAGS+= -DRABIT_USE_HDFS=1 -I$(HADOOP_HDFS_HOME)/include -I$(JAVA_HOME)/include
LDFLAGS+= -L$(HADOOP_HDFS_HOME)/lib/native -L$(LIBJVM) -lhdfs -ljvm
else
CFLAGS+= -DRABIT_USE_HDFS=0
endif
.PHONY: clean all lib mpi
all: $(BIN) $(MOCKBIN)
mpi: $(MPIBIN)
lib:
@@ -15,10 +22,12 @@ lib:
libmpi:
cd ../..;make lib/librabit_mpi.a;cd -
$(BIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) $(LDFLAGS) -lrabit_mock
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) -lrabit_mock $(LDFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )

View File

@@ -0,0 +1,21 @@
#-----------------------------------------------------
# rabit-learn: the configuration compile script
#
# This is the default configuration setup for rabit-learn
# If you want to change configuration, do the following steps:
#
# - copy this file to the root of rabit-learn folder
# - modify the configuration you want
# - type make or make -j n for parallel build
#----------------------------------------------------
# choice of compiler
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
# whether to use HDFS support during compile
USE_HDFS = 1
# path to libjvm.so
LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server

View File

@@ -14,7 +14,9 @@
#include <cstring>
#include <limits>
#include <cmath>
#include <sstream>
#include <rabit.h>
#include "../io/io.h"
namespace rabit {
// typedef index type
@@ -45,49 +47,37 @@ struct SparseMat {
}
// load data from LibSVM format
inline void Load(const char *fname) {
FILE *fi;
if (!strcmp(fname, "stdin")) {
fi = stdin;
} else {
if (strchr(fname, '%') != NULL) {
char s_tmp[256];
snprintf(s_tmp, sizeof(s_tmp), fname, rabit::GetRank());
fi = utils::FopenCheck(s_tmp, "r");
} else {
fi = utils::FopenCheck(fname, "r");
}
}
io::InputSplit *in =
io::CreateInputSplit
(fname, rabit::GetRank(),
rabit::GetWorldSize());
row_ptr.clear();
row_ptr.push_back(0);
data.clear();
feat_dim = 0;
float label; bool init = true;
char tmp[1024];
while (fscanf(fi, "%s", tmp) == 1) {
std::string line;
while (in->NextLine(&line)) {
float label;
std::istringstream ss(line);
ss >> label;
Entry e;
unsigned long fidx;
if (sscanf(tmp, "%lu:%f", &fidx, &e.fvalue) == 2) {
while (!ss.eof()) {
if (!(ss >> fidx)) break;
ss.ignore(32, ':');
if (!(ss >> e.fvalue)) break;
e.findex = static_cast<index_t>(fidx);
data.push_back(e);
feat_dim = std::max(fidx, feat_dim);
} else {
if (!init) {
labels.push_back(label);
row_ptr.push_back(data.size());
}
utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
init = false;
}
labels.push_back(label);
row_ptr.push_back(data.size());
}
// last row
labels.push_back(label);
row_ptr.push_back(data.size());
delete in;
feat_dim += 1;
utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
"feature dimension exceed limit of index_t"\
"consider change the index_t to unsigned long");
// close the filed
if (fi != stdin) fclose(fi);
}
inline size_t NumRow(void) const {
return row_ptr.size() - 1;
@@ -98,6 +88,7 @@ struct SparseMat {
std::vector<Entry> data;
std::vector<float> labels;
};
// dense matrix
struct Matrix {
inline void Init(size_t nrow, size_t ncol, float v = 0.0f) {
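A minimal sketch of walking the CSR layout that the new SparseMat::Load fills in: row_ptr delimits each instance's entries in data, labels holds the targets. It assumes an already initialized rabit program and the demo data path:

rabit::SparseMat mat;
mat.Load("../data/agaricus.txt.train");
for (size_t i = 0; i < mat.NumRow(); ++i) {
  float y = mat.labels[i];
  for (size_t j = mat.row_ptr[i]; j < mat.row_ptr[i + 1]; ++j) {
    // mat.data[j].findex is the feature index, mat.data[j].fvalue the feature value
  }
  (void)y;
}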

View File

@@ -1,40 +0,0 @@
#ifndef RABIT_LEARN_UTILS_IO_H_
#define RABIT_LEARN_UTILS_IO_H_
/*!
* \file io.h
* \brief additional stream interface
* \author Tianqi Chen
*/
namespace rabit {
namespace utils {
/*! \brief implementation of file i/o stream */
class FileStream : public ISeekStream {
public:
explicit FileStream(FILE *fp) : fp(fp) {}
explicit FileStream(void) {
this->fp = NULL;
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, size, 1, fp);
}
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
inline void Close(void) {
if (fp != NULL){
std::fclose(fp); fp = NULL;
}
}
private:
FILE *fp;
};
} // namespace utils
} // namespace rabit
#endif // RABIT_LEARN_UTILS_IO_H_