add all

2015-02-09 20:26:39 -08:00
parent 12ee049a74
commit 4a5b9e5f78
14 changed files with 8596 additions and 21 deletions
--- a/rabit-learn/utils/base64.h
+++ b/rabit-learn/utils/base64.h
@@ -0,0 +1,204 @@
+#ifndef RABIT_LEARN_UTILS_BASE64_H_
+#define RABIT_LEARN_UTILS_BASE64_H_
+/*!
+ * \file base64.h
+ * \brief data stream support to input and output from/to base64 stream
+ * base64 is easier to store and pass as text format in mapreduce
+ * \author Tianqi Chen
+ */
+#include <cctype>
+#include <cstdio>
+#include <rabit/io.h>
+
+namespace rabit {
+namespace utils {
+/*!
+ * \brief namespace of base64 decoding and encoding table
+ *
+ * DecodeTable maps a character code to its 6-bit base64 value and
+ * EncodeTable maps a 6-bit value back to its character.
+ */
+namespace base64 {
+// Indexed by character code; the positions follow ASCII ('+' is entry 43,
+// 'z' is entry 122). The table stops at 'z', so it has no entry for any
+// larger code. Every code that is not a base64 digit, including the padding
+// character '=', is stored as 0 and is therefore indistinguishable from 'A'.
+const char DecodeTable[] = {
+  // codes 0-42: not base64 digits
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  62,  // '+'
+  // codes 44-46: not base64 digits
+  0, 0, 0,
+  63,  // '/'
+  52, 53, 54, 55, 56, 57, 58, 59, 60, 61,  // '0'-'9'
+  // codes 58-64 (this range contains '='): not base64 digits
+  0, 0, 0, 0, 0, 0, 0,
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+  13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'Z'
+  // codes 91-96: not base64 digits
+  0, 0, 0, 0, 0, 0,
+  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+  39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  // 'a'-'z'
+};
+// Indexed by 6-bit value (0..63); the string literal adds a trailing '\0'.
+static const char EncodeTable[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+} // namespace base64
+/*!
+ * \brief the stream that reads from base64, note we take from file pointers
+ *
+ * The stream decodes one base64 record that is terminated by whitespace or
+ * by the end of the file. Call InitPosition() before the first Read();
+ * IsEOF() tells whether the record has been fully consumed.
+ */
+class Base64InStream: public IStream {
+ public:
+  explicit Base64InStream(FILE *fp) : fp(fp) {
+    num_prev = 0; tmp_ch = 0;
+  }
+  /*!
+   * \brief initialize the stream position to beginning of next base64 stream
+   * call this function before actually start read
+   */
+  inline void InitPosition(void) {
+    // skip leading whitespace; afterwards tmp_ch holds the first
+    // non-space character of the record, or EOF
+    do {
+      tmp_ch = fgetc(fp);
+    } while (isspace(tmp_ch));
+  }
+  /*! \brief whether current position is end of a base64 stream */
+  inline bool IsEOF(void) const {
+    // bytes still waiting in buf_prev count as unread data
+    return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
+  }
+  /*!
+   * \brief decode up to size bytes of the current record into ptr
+   * \param ptr destination buffer, must have room for size bytes
+   * \param size number of decoded bytes requested
+   * \return number of bytes stored in ptr, which is smaller than size
+   *  when the record ends first (only possible while kStrictCheck is false)
+   */
+  virtual size_t Read(void *ptr, size_t size) {
+    if (size == 0) return 0;
+    // use tlen to record left size
+    size_t tlen = size;
+    unsigned char *cptr = static_cast<unsigned char*>(ptr);
+    // a previous call may have decoded more bytes than it was asked for;
+    // hand those out first
+    if (num_prev != 0) {
+      if (num_prev == 2) {
+        if (tlen >= 2) {
+          *cptr++ = buf_prev[0];
+          *cptr++ = buf_prev[1];
+          tlen -= 2;
+          num_prev = 0;
+        } else {
+          // tlen == 1 here: give out one byte, keep the other one buffered
+          *cptr++ = buf_prev[0]; --tlen;
+          buf_prev[0] = buf_prev[1];
+          num_prev = 1;
+        }
+      } else {
+        // num_prev == 1 here
+        *cptr++ = buf_prev[0]; --tlen; num_prev = 0;
+      }
+    }
+    if (tlen == 0) return size;
+    int nvalue;
+    // note: everything goes with 4 characters in Base64,
+    // so we process 4 characters (3 decoded bytes) as a unit
+    while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
+      // first character: bits 23..18
+      nvalue = DecodeChar(tmp_ch) << 18;
+      {
+        // second character: bits 17..12, completes the first byte
+        Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
+              "invalid base64 format");
+        nvalue |= DecodeChar(tmp_ch) << 12;
+        *cptr++ = (nvalue >> 16) & 0xFF; --tlen;
+      }
+      {
+        // third character: bits 11..6, completes the second byte
+        Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
+              "invalid base64 format");
+        // "==" padding: the record carried a single byte in this unit
+        if (tmp_ch == '=') {
+          Check((tmp_ch = fgetc(fp), tmp_ch == '='), "invalid base64 format");
+          Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
+                "invalid base64 format");
+          break;
+        }
+        nvalue |= DecodeChar(tmp_ch) << 6;
+        if (tlen) {
+          *cptr++ = (nvalue >> 8) & 0xFF; --tlen;
+        } else {
+          // caller's buffer is full, keep the byte for the next Read
+          buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
+        }
+      }
+      {
+        // fourth character: bits 5..0, completes the third byte
+        Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
+              "invalid base64 format");
+        // "=" padding: the record carried two bytes in this unit
+        if (tmp_ch == '=') {
+          Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
+                "invalid base64 format");
+          break;
+        }
+        nvalue |= DecodeChar(tmp_ch);
+        if (tlen) {
+          *cptr++ = nvalue & 0xFF; --tlen;
+        } else {
+          // caller's buffer is full, keep the byte for the next Read
+          buf_prev[num_prev ++] = nvalue & 0xFF;
+        }
+      }
+      // look ahead one character so the loop test and IsEOF can see it
+      tmp_ch = fgetc(fp);
+    }
+    if (kStrictCheck) {
+      Check(tlen == 0, "Base64InStream: read incomplete");
+    }
+    return size - tlen;
+  }
+  /*! \brief writing is not supported, reports an error through utils::Error */
+  virtual void Write(const void *ptr, size_t size) {
+    utils::Error("Base64InStream do not support write");
+  }
+
+ private:
+  /*!
+   * \brief look up the 6-bit value of one input character
+   *
+   * The decode table only covers codes up to 'z'. Any other byte read from
+   * the file would index past the end of the table, so it is reported as
+   * malformed input instead.
+   * \param ch character code as returned by fgetc, must not be EOF
+   * \return entry of base64::DecodeTable for ch
+   */
+  inline static int DecodeChar(int ch) {
+    Check(ch >= 0 && static_cast<size_t>(ch) < sizeof(base64::DecodeTable),
+          "invalid base64 format");
+    return base64::DecodeTable[ch];
+  }
+  // file to read from; it is not closed by this class
+  FILE *fp;
+  // look-ahead character, the next one that has not been decoded yet
+  int tmp_ch;
+  // number of valid bytes in buf_prev
+  int num_prev;
+  // decoded bytes that did not fit into the caller's buffer
+  unsigned char buf_prev[2];
+  // whether we need to do strict check
+  static const bool kStrictCheck = false;
+};
+/*!
+ * \brief the stream that write to base64, note we take from file pointers
+ *
+ * Input bytes are collected in groups of three and each full group is
+ * written as four base64 characters. Bytes of an incomplete group stay in
+ * the internal buffer until Finish() is called.
+ */
+class Base64OutStream: public IStream {
+ public:
+  explicit Base64OutStream(FILE *fp) : fp(fp) {
+    buf_top = 0;
+  }
+  /*!
+   * \brief encode size bytes starting at ptr
+   * \param ptr data to encode
+   * \param size number of bytes at ptr
+   */
+  virtual void Write(const void *ptr, size_t size) {
+    using base64::EncodeTable;
+    size_t tlen = size;
+    const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
+    while (tlen) {
+      // fill buf[1..3]; slot 0 is never used
+      while (buf_top < 3  && tlen != 0) {
+        buf[++buf_top] = *cptr++; --tlen;
+      }
+      if (buf_top == 3) {
+        // flush 4 bytes out
+        fputc(EncodeTable[buf[1] >> 2], fp);
+        fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
+        fputc(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F], fp);
+        fputc(EncodeTable[buf[3] & 0x3F], fp);
+        buf_top = 0;
+      }
+    }
+  }
+  /*! \brief reading is not supported, reports an error and returns 0 */
+  virtual size_t Read(void *ptr, size_t size) {
+    Error("Base64OutStream do not support read");
+    return 0;
+  }
+  /*!
+   * \brief finish writing of all current base64 stream, do some post processing
+   *
+   * Writes the buffered one or two bytes with '=' padding, then the
+   * optional end character.
+   * \param endch character to put to end of stream, if it is EOF, then nothing will be done
+   *  The parameter is an int so that EOF stays distinguishable from every
+   *  character value even where plain char is unsigned.
+   */
+  inline void Finish(int endch = EOF) {
+    using base64::EncodeTable;
+    if (buf_top == 1) {
+      fputc(EncodeTable[buf[1] >> 2], fp);
+      fputc(EncodeTable[(buf[1] << 4) & 0x3F], fp);
+      fputc('=', fp);
+      fputc('=', fp);
+    }
+    if (buf_top == 2) {
+      fputc(EncodeTable[buf[1] >> 2], fp);
+      fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
+      fputc(EncodeTable[(buf[2] << 2) & 0x3F], fp);
+      fputc('=', fp);
+    }
+    buf_top = 0;
+    if (endch != EOF) fputc(endch, fp);
+  }
+
+ private:
+  // file to write to; it is not closed by this class
+  FILE *fp;
+  // number of pending bytes, stored in buf[1..buf_top]
+  int buf_top;
+  // pending bytes of the current group
+  unsigned char buf[4];
+};
+}  // namespace utils
+}  // namespace rabit
+#endif  // RABIT_LEARN_UTILS_BASE64_H_
--- a/rabit-learn/utils/data.h
+++ b/rabit-learn/utils/data.h
@@ -0,0 +1,143 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file data.h
+ * \brief simple data structure that could be used by model
+ *
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_LEARN_DATA_H_
+#define RABIT_LEARN_DATA_H_
+
+#include <vector>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <limits>
+#include <cmath>
+#include <rabit.h>
+
+namespace rabit {
+/*! \brief integer type used for feature indices */
+typedef unsigned int index_t;
+
+/*! \brief sparse matrix, CSR format */
+struct SparseMat {
+  /*! \brief one non-zero entry of the matrix */
+  struct Entry {
+    // feature index
+    index_t findex;
+    // feature value
+    float fvalue;
+  };
+  /*! \brief view of one row; it points into SparseMat::data and owns nothing */
+  struct Vector {
+    const Entry *data;
+    index_t length;
+    inline const Entry &operator[](size_t i) const {
+      return data[i];
+    }
+  };
+  /*!
+   * \brief get row i
+   * \param i row index, must be smaller than NumRow()
+   * \return view of the entries of row i
+   */
+  inline Vector operator[](size_t i) const {
+    Vector v;
+    v.data = &data[0] + row_ptr[i];
+    v.length = static_cast<index_t>(row_ptr[i + 1]-row_ptr[i]);
+    return v;
+  }
+  /*!
+   * \brief load data from LibSVM format, replacing the current content
+   *
+   * The input is a sequence of whitespace separated tokens. A token of the
+   * form "index:value" adds an entry to the current row, any other token is
+   * parsed as the label that starts a new row.
+   * \param fname file to read; "stdin" reads standard input, and a name
+   *  that contains '%' is used as a printf-style pattern that receives
+   *  rabit::GetRank() as its argument
+   */
+  inline void Load(const char *fname) {
+    FILE *fi;
+    if (!strcmp(fname, "stdin")) {
+      fi = stdin;
+    } else {
+      if (strchr(fname, '%') != NULL) {
+        // NOTE(review): fname is passed as the format string, so it is
+        // expected to contain exactly one integer conversion
+        char s_tmp[256];
+        snprintf(s_tmp, sizeof(s_tmp), fname, rabit::GetRank());
+        fi = utils::FopenCheck(s_tmp, "r");
+      } else {
+        fi = utils::FopenCheck(fname, "r");
+      }
+    }
+    row_ptr.clear();
+    row_ptr.push_back(0);
+    data.clear();
+    // labels must be reset together with the rows, otherwise a second
+    // Load would leave labels of the previous file in front
+    labels.clear();
+    feat_dim = 0;
+    float label = 0.0f;
+    // becomes true once the first label has been read
+    bool has_row = false;
+    char tmp[1024];
+    // the field width keeps an over-long token from overflowing tmp
+    while (fscanf(fi, "%1023s", tmp) == 1) {
+      Entry e;
+      unsigned long fidx;
+      if (sscanf(tmp, "%lu:%f", &fidx, &e.fvalue) == 2) {
+        e.findex = static_cast<index_t>(fidx);
+        data.push_back(e);
+        // explicit compare: size_t and unsigned long may be different types
+        if (static_cast<size_t>(fidx) > feat_dim) {
+          feat_dim = static_cast<size_t>(fidx);
+        }
+      } else {
+        if (has_row) {
+          // a new label closes the row that was being read
+          labels.push_back(label);
+          row_ptr.push_back(data.size());
+        }
+        utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
+        has_row = true;
+      }
+    }
+    // last row, only present when at least one label was read
+    if (has_row) {
+      labels.push_back(label);
+      row_ptr.push_back(data.size());
+    }
+    // indices are zero based, so the dimension is the largest index plus one
+    feat_dim += 1;
+    utils::Check(feat_dim < std::numeric_limits<index_t>::max(),
+                 "feature dimension exceed limit of index_t, "
+                 "consider change the index_t to unsigned long");
+    // close the file
+    if (fi != stdin) fclose(fi);
+  }
+  /*! \brief number of rows currently stored */
+  inline size_t NumRow(void) const {
+    return row_ptr.size() - 1;
+  }
+  // maximum feature dimension
+  size_t feat_dim;
+  // row_ptr[i] is the offset in data of the first entry of row i
+  std::vector<size_t> row_ptr;
+  // entries of all rows, stored row after row
+  std::vector<Entry> data;
+  // one label per row
+  std::vector<float> labels;
+};
+/*! \brief dense matrix of float, rows are stored one after another */
+struct Matrix {
+  /*!
+   * \brief set the shape and give every element the same value
+   * \param nrow number of rows
+   * \param ncol number of columns
+   * \param v value stored in every element
+   */
+  inline void Init(size_t nrow, size_t ncol, float v = 0.0f) {
+    this->nrow = nrow;
+    this->ncol = ncol;
+    data.assign(nrow * ncol, v);
+  }
+  /*! \brief pointer to the first element of row i */
+  inline float *operator[](size_t i) {
+    float *first = &data[0];
+    return first + i * ncol;
+  }
+  /*! \brief pointer to the first element of row i, read only */
+  inline const float *operator[](size_t i) const {
+    const float *first = &data[0];
+    return first + i * ncol;
+  }
+  /*!
+   * \brief write the matrix as text, one row per line, values separated by a space
+   * \param fname file to write, "stdout" selects standard output
+   */
+  inline void Print(const char *fname) {
+    const bool to_stdout = (strcmp(fname, "stdout") == 0);
+    FILE *fo = to_stdout ? stdout : utils::FopenCheck(fname, "w");
+    const size_t total = data.size();
+    for (size_t k = 0; k != total; ++k) {
+      fprintf(fo, "%g", data[k]);
+      // the last value of a row is followed by a newline, the others by a space
+      const bool row_done = ((k + 1) % ncol == 0);
+      fputs(row_done ? "\n" : " ", fo);
+    }
+    // only close what was opened here
+    if (!to_stdout) fclose(fo);
+  }
+  // number of rows
+  size_t nrow;
+  // number of columns
+  size_t ncol;
+  // element storage, row i starts at offset i * ncol
+  std::vector<float> data;
+};
+
+/*!
+ * \brief draw a number from rand() and reduce it modulo value
+ * \param value the modulus, must not be zero
+ * \return rand() % value
+ */
+inline int Random(int value) {
+  const int draw = rand();
+  return draw % value;
+}
+} // namespace rabit
+#endif // RABIT_LEARN_DATA_H_
--- a/rabit-learn/utils/io.h
+++ b/rabit-learn/utils/io.h
@@ -0,0 +1,40 @@
+#ifndef RABIT_LEARN_UTILS_IO_H_
+#define RABIT_LEARN_UTILS_IO_H_
+/*!
+ * \file io.h
+ * \brief additional stream interface
+ * \author Tianqi Chen
+ */
+namespace rabit {
+namespace utils {
+/*!
+ * \brief implementation of file i/o stream
+ *
+ * Thin wrapper over a FILE pointer. The file is only closed by an explicit
+ * call to Close().
+ */
+class FileStream : public ISeekStream {
+ public:
+  explicit FileStream(FILE *fp) : fp(fp) {}
+  explicit FileStream(void) {
+    this->fp = NULL;
+  }
+  /*!
+   * \brief read up to size bytes into ptr
+   * \param ptr destination buffer
+   * \param size number of bytes requested
+   * \return number of bytes actually read, smaller than size at end of
+   *  file or on error
+   */
+  virtual size_t Read(void *ptr, size_t size) {
+    // element size 1 makes fread report bytes; with (size, 1) it would
+    // report 0 or 1 items instead of the amount of data read
+    return std::fread(ptr, 1, size, fp);
+  }
+  /*!
+   * \brief write size bytes from ptr
+   * \param ptr data to write
+   * \param size number of bytes at ptr
+   */
+  virtual void Write(const void *ptr, size_t size) {
+    // NOTE(review): the result of fwrite is not checked, a failed write
+    // goes unnoticed here
+    std::fwrite(ptr, size, 1, fp);
+  }
+  /*! \brief move to byte offset pos counted from the start of the file */
+  virtual void Seek(size_t pos) {
+    std::fseek(fp, static_cast<long>(pos), SEEK_SET);
+  }
+  /*! \brief current offset as reported by ftell */
+  virtual size_t Tell(void) {
+    return std::ftell(fp);
+  }
+  /*! \brief close the file if one is held; calling it again does nothing */
+  inline void Close(void) {
+    if (fp != NULL){
+      std::fclose(fp); fp = NULL;
+    }
+  }
+
+ private:
+  // underlying file, NULL when no file is held
+  FILE *fp;
+};
+}  // namespace utils
+}  // namespace rabit
+#endif  // RABIT_LEARN_UTILS_IO_H_