Merge commit '13776a006a4e572720ec4c5b029b54771cf2b35c' into unity

2015-02-11 20:33:35 -08:00
parent c40afa2023 13776a006a
commit e923bdb12f
21 changed files with 9698 additions and 22 deletions
--- a/subtree/rabit/rabit-learn/linear/Makefile
+++ b/subtree/rabit/rabit-learn/linear/Makefile
@@ -0,0 +1,14 @@
+# Build rules for the linear / logistic regression example.
+# Names of the programs built from this directory.
+# NOTE(review): BIN/MOCKBIN/MPIBIN/OBJ are presumably consumed by ../common.mk (included below) - confirm.
+BIN = linear.rabit
+MOCKBIN= linear.mock
+MPIBIN = 
+# object files that make up the programs above
+OBJ = linear.o
+
+# common build script for programs
+include ../common.mk
+# linear.h includes <omp.h> and linear.cc uses "#pragma omp", so OpenMP must be enabled
+CFLAGS+=-fopenmp
+linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
+# dependencies of the final binaries
+linear.rabit: linear.o lib
+linear.mock: linear.o lib
--- a/subtree/rabit/rabit-learn/linear/README.md
+++ b/subtree/rabit/rabit-learn/linear/README.md
@@ -0,0 +1,33 @@
+Linear and Logistic Regression
+====
+* input format: LibSVM
+* Example: [run-linear.sh](run-linear.sh)
+
+Parameters
+===
+All the parameters can be set by param=value
+
+#### Important Parameters
+* objective [default = logistic]
+  - can be linear or logistic
+* base_score [default = 0.5]
+  - global bias; it is recommended to set this to the mean value of the labels
+* reg_L1 [default = 0]
+  - l1 regularization co-efficient
+* reg_L2 [default = 1]
+  - l2 regularization co-efficient
+* lbfgs_stop_tol [default = 1e-5]
+  - relative tolerance level of loss reduction with respect to initial loss
+* max_lbfgs_iter [default = 500]
+  - maximum number of lbfgs iterations
+
+#### Optimization Related Parameters
+* min_lbfgs_iter [default = 5]
+  - minimum number of lbfgs iterations
+* max_linesearch_iter [default = 100] 
+  - maximum number of iterations in linesearch
+* linesearch_c1 [default = 1e-4] 
+  - c1 co-efficient in backoff linesearch
+* linesearch_backoff [default = 0.5]
+  - backoff ratio in linesearch
+ 
--- a/subtree/rabit/rabit-learn/linear/linear.cc
+++ b/subtree/rabit/rabit-learn/linear/linear.cc
@@ -0,0 +1,239 @@
+#include "./linear.h"
+#include "../utils/io.h"
+#include "../utils/base64.h"
+
+namespace rabit {
+namespace linear {
+// Objective function of linear / logistic regression.
+// Owns the local data, the model and the L-BFGS solver, and registers itself
+// with the solver through lbfgs.SetObjFunction(this).
+// NOTE(review): the virtual methods below are presumably the callbacks
+// declared by solver::IObjFunction - confirm against ../solver/lbfgs.h.
+class LinearObjFunction : public solver::IObjFunction<float> {
+ public:
+  // number of OpenMP threads used by Eval/CalcGrad;
+  // values <= 0 leave the OpenMP thread setting untouched
+  int nthread;
+  // L2 regularization coefficient
+  float reg_L2;
+  // model
+  LinearModel model;
+  // data loaded by LoadData; used for training and for task=pred
+  SparseMat dtrain;
+  // solver
+  solver::LBFGSSolver<float> lbfgs;
+  // constructor: registers the objective with the solver and sets defaults
+  LinearObjFunction(void) {
+    lbfgs.SetObjFunction(this);
+    nthread = 1;
+    reg_L2 = 0.0f;
+    model.weight = NULL;
+    task = "train";
+    // the literal string "NULL" means "no input model"
+    model_in = "NULL";
+    name_pred = "pred.txt";
+    model_out = "final.model";
+  }
+  virtual ~LinearObjFunction(void) {
+  }
+  // set a parameter by name; every pair is forwarded to both the model
+  // parameters and the solver, then the options owned by this class are parsed
+  inline void SetParam(const char *name, const char *val) {
+    model.param.SetParam(name, val);
+    lbfgs.SetParam(name, val);
+    if (!strcmp(name, "num_feature")) {
+      // the solver dimension is the number of features plus one bias term
+      char ndigit[30];
+      // cast so that the argument type matches the %lu conversion
+      sprintf(ndigit, "%lu",
+              static_cast<unsigned long>(model.param.num_feature + 1));
+      lbfgs.SetParam("num_dim", ndigit);
+    }
+    if (!strcmp(name, "reg_L2")) {
+      reg_L2 = static_cast<float>(atof(val));
+    }
+    if (!strcmp(name, "nthread")) {
+      nthread = atoi(val);
+    }
+    if (!strcmp(name, "task")) task = val;
+    if (!strcmp(name, "model_in")) model_in = val;
+    if (!strcmp(name, "model_out")) model_out = val;
+    if (!strcmp(name, "name_pred")) name_pred = val;
+  }
+  // run the configured task: optionally load model_in, then either train
+  // and save to model_out (task=train) or write predictions (task=pred)
+  inline void Run(void) {
+    if (model_in != "NULL") {
+      this->LoadModel(model_in.c_str());
+    }
+    if (task == "train") {
+      lbfgs.Run();
+      this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+    } else if (task == "pred") {
+      this->TaskPred();
+    } else {
+      utils::Error("unknown task=%s", task.c_str());
+    }
+  }
+  // write one prediction per row of the loaded data to name_pred
+  inline void TaskPred(void) {
+    utils::Check(model_in != "NULL",
+                 "must set model_in for task=pred");
+    FILE *fp = utils::FopenCheck(name_pred.c_str(), "w");
+    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
+      float pred = model.Predict(dtrain[i]);
+      fprintf(fp, "%g\n", pred);
+    }
+    fclose(fp);
+    printf("Finishing writing to %s\n", name_pred.c_str());
+  }
+  // load a model file; the first four bytes select the encoding:
+  // "bs64" for base64, "binf" for raw binary
+  inline void LoadModel(const char *fname) {
+    FILE *fp = utils::FopenCheck(fname, "rb");
+    std::string header; header.resize(4);
+    utils::FileStream fi(fp);
+    utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
+    bool known_format = true;
+    if (header == "bs64") {
+      // base64 format
+      utils::Base64InStream bsin(fp);
+      bsin.InitPosition();
+      model.Load(bsin);
+    } else if (header == "binf") {
+      // binary format
+      model.Load(fi);
+    } else {
+      known_format = false;
+    }
+    // close the file on every path, including the invalid-header one,
+    // before the error is reported
+    fclose(fp);
+    if (!known_format) {
+      utils::Error("invalid model file");
+    }
+  }
+  // save the model parameters together with the weights in wptr;
+  // fname == "stdout" writes to standard output, always base64 encoded;
+  // otherwise base64 is used only when save_base64 is set
+  inline void SaveModel(const char *fname,
+                        const float *wptr,
+                        bool save_base64 = false) {
+    FILE *fp;
+    bool use_stdout = false;
+    if (!strcmp(fname, "stdout")) {
+      fp = stdout;
+      use_stdout = true;
+    } else {
+      fp = utils::FopenCheck(fname, "wb");
+    }
+    utils::FileStream fo(fp);
+    if (save_base64 || use_stdout) {
+      fo.Write("bs64\t", 5);
+      utils::Base64OutStream bout(fp);
+      model.Save(bout, wptr);
+      bout.Finish('\n');
+    } else {
+      fo.Write("binf", 4);
+      model.Save(fo, wptr);
+    }
+    // stdout is not owned by this function, so it is left open
+    if (!use_stdout) {
+      fclose(fp);
+    }
+  }
+  // load the data from fname
+  inline void LoadData(const char *fname) {
+    dtrain.Load(fname);
+  }
+  // return the number of weights: num_feature plus one bias term;
+  // without an input model, num_feature is first raised to the maximum
+  // feature dimension over all workers
+  virtual size_t InitNumDim(void)  {
+    if (model_in == "NULL") {
+      size_t ndim = dtrain.feat_dim;
+      rabit::Allreduce<rabit::op::Max>(&ndim, 1);
+      model.param.num_feature = std::max(ndim, model.param.num_feature);
+    }
+    return model.param.num_feature + 1;
+  }
+  // fill the initial weight vector: zeros when training from scratch,
+  // otherwise the loaded weights broadcast from rank 0
+  virtual void InitModel(float *weight, size_t size) {
+    if (model_in == "NULL") {
+      // memset takes the fill byte as an int
+      memset(weight, 0, size * sizeof(float));
+      model.param.InitBaseScore();
+    } else {
+      rabit::Broadcast(model.weight, size * sizeof(float), 0);
+      memcpy(weight, model.weight, size * sizeof(float));
+    }
+  }
+  // load the model parameters (not the weights) from a stream
+  virtual void Load(rabit::IStream &fi) {
+    fi.Read(&model.param, sizeof(model.param));
+  }
+  // save the model parameters (not the weights) to a stream
+  virtual void Save(rabit::IStream &fo) const {
+    fo.Write(&model.param, sizeof(model.param));
+  }
+  // loss summed over the local rows, plus the L2 penalty on rank 0
+  virtual double Eval(const float *weight, size_t size) {
+    // a non-positive thread count is not a valid OpenMP setting
+    if (nthread > 0) omp_set_num_threads(nthread);
+    utils::Check(size == model.param.num_feature + 1,
+                 "size consistency check");
+    double sum_val = 0.0;
+    #pragma omp parallel for schedule(static) reduction(+:sum_val)
+    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
+      float py = model.param.PredictMargin(weight, dtrain[i]);
+      float fv = model.param.MarginToLoss(dtrain.labels[i], py);
+      sum_val += fv;
+    }
+    if (rabit::GetRank() == 0) {
+      // only add regularization once
+      // NOTE(review): presumably the caller sums the value over workers
+      if (reg_L2 != 0.0f) {
+        double sum_sqr = 0.0;
+        // the bias term weight[num_feature] is not regularized
+        for (size_t i = 0; i < model.param.num_feature; ++i) {
+          sum_sqr += weight[i] * weight[i];
+        }
+        sum_val += 0.5 * reg_L2 * sum_sqr;
+      }
+    }
+    utils::Check(!std::isnan(sum_val), "nan occurs");
+    return sum_val;
+  }
+  // gradient of the loss over the local rows, written to out_grad;
+  // out_grad[num_feature] receives the gradient of the bias term
+  virtual void CalcGrad(float *out_grad,
+                        const float *weight,
+                        size_t size) {
+    if (nthread > 0) omp_set_num_threads(nthread);
+    utils::Check(size == model.param.num_feature + 1,
+                 "size consistency check");
+    memset(out_grad, 0, sizeof(float) * size);
+    double sum_gbias = 0.0;
+    #pragma omp parallel for schedule(static) reduction(+:sum_gbias)
+    for (size_t i = 0; i < dtrain.NumRow(); ++i) {
+      SparseMat::Vector v = dtrain[i];
+      float py = model.param.Predict(weight, v);
+      float grad = model.param.PredToGrad(dtrain.labels[i], py);
+      for (index_t j = 0; j < v.length; ++j) {
+        const size_t fid = v[j].findex;
+        // features outside the model are ignored, as in PredictMargin;
+        // without this check the write below could go out of bounds
+        if (fid >= model.param.num_feature) continue;
+        const float delta = v[j].fvalue * grad;
+        // rows handled by different threads may share a feature, so the
+        // update of the shared gradient entry must be atomic
+        #pragma omp atomic
+        out_grad[fid] += delta;
+      }
+      sum_gbias += grad;
+    }
+    out_grad[model.param.num_feature] = static_cast<float>(sum_gbias);
+    if (rabit::GetRank() == 0) {
+      // only add regularization once
+      if (reg_L2 != 0.0f) {
+        for (size_t i = 0; i < model.param.num_feature; ++i) {
+          out_grad[i] += reg_L2 * weight[i];
+        }
+      }
+    }
+  }
+
+ private:
+  // task to run: "train" or "pred"
+  std::string task;
+  // path of the model to load, "NULL" when there is none
+  std::string model_in;
+  // path the trained model is saved to
+  std::string model_out;
+  // path the predictions are written to
+  std::string name_pred;
+};
+}  // namespace linear
+}  // namespace rabit
+
+// Entry point: argv[1] is the data file (or "stdin"), every further
+// argument of the form name=value is passed to the objective as a parameter.
+int main(int argc, char *argv[]) {
+  if (argc < 2) {
+    // initialize rabit engine so that only rank 0 prints the usage
+    rabit::Init(argc, argv);
+    if (rabit::GetRank() == 0) {
+      rabit::TrackerPrintf("Usage: <data_in> param=val\n");
+    }
+    rabit::Finalize();
+    return 0;
+  }
+  rabit::linear::LinearObjFunction linear;
+  if (!strcmp(argv[1], "stdin")) {
+    // data coming from stdin is read before the engine is initialized
+    linear.LoadData(argv[1]);
+    rabit::Init(argc, argv);
+  } else {
+    rabit::Init(argc, argv);
+    linear.LoadData(argv[1]);
+  }
+  for (int i = 2; i < argc; ++i) {
+    char name[256], val[256];
+    // field widths keep an over-long argument from overflowing the buffers
+    // (255 characters plus the terminating null byte)
+    if (sscanf(argv[i], "%255[^=]=%255s", name, val) == 2) {
+      linear.SetParam(name, val);
+    }
+  }
+  linear.Run();
+  rabit::Finalize();
+  return 0;
+}
--- a/subtree/rabit/rabit-learn/linear/linear.h
+++ b/subtree/rabit/rabit-learn/linear/linear.h
@@ -0,0 +1,133 @@
+/*!
+ *  Copyright (c) 2015 by Contributors
+ * \file linear.h
+ * \brief Linear and Logistic regression
+ *
+ * \author Tianqi Chen
+ */
+#ifndef RABIT_LINEAR_H_
+#define RABIT_LINEAR_H_
+#include <omp.h>
+#include "../utils/data.h"
+#include "../solver/lbfgs.h"
+
+namespace rabit {
+namespace linear {
+/*! \brief simple linear model */
+struct LinearModel {
+  /*! \brief parameters of the model; read and written as raw bytes by Load/Save */
+  struct ModelParam {
+    /*! \brief global bias */
+    float base_score;
+    /*! \brief number of features  */
+    size_t num_feature;
+    /*! \brief loss type: 0 for linear, 1 for logistic */
+    int loss_type;
+    // reserved field
+    int reserved[16];
+    // constructor
+    ModelParam(void) {
+      base_score = 0.5f;
+      num_feature = 0;
+      loss_type = 1;
+      std::memset(reserved, 0, sizeof(reserved));
+    }
+    /*!
+     * \brief convert base_score from a probability to a margin
+     *  Only the logistic objective predicts through a sigmoid, so only
+     *  there base_score has to lie in (0,1) and be mapped to its logit;
+     *  for the linear objective base_score is used as given.
+     */
+    inline void InitBaseScore(void) {
+      if (loss_type != 1) return;
+      utils::Check(base_score > 0.0f && base_score < 1.0f,
+                   "base_score must be in (0,1) for logistic loss");
+      base_score = -std::log(1.0f / base_score - 1.0f);
+    }
+    /*!
+     * \brief set parameters from outside
+     * \param name name of the parameter
+     * \param val value of the parameter
+     */
+    inline void SetParam(const char *name, const char *val) {
+      using namespace std;
+      if (!strcmp("base_score", name)) {
+        base_score = static_cast<float>(atof(val));
+      }
+      if (!strcmp("num_feature", name)) {
+        num_feature = static_cast<size_t>(atol(val));
+      }
+      if (!strcmp("objective", name)) {
+        if (!strcmp("linear", val)) {
+          loss_type = 0;
+        } else if (!strcmp("logistic", val)) {
+          loss_type = 1;
+        } else {
+          utils::Error("unknown objective type %s\n", val);
+        }
+      }
+    }
+    /*!
+     * \brief transform margin to prediction
+     * \param margin the margin value
+     * \return sigmoid of the margin for logistic loss, the margin itself otherwise
+     */
+    inline float MarginToPred(float margin) const {
+      if (loss_type == 1) {
+        return 1.0f / (1.0f + std::exp(-margin));
+      } else {
+        return margin;
+      }
+    }
+    /*!
+     * \brief compute the loss of one instance from its margin
+     * \param label label of the instance
+     * \param margin the margin value
+     * \return cross entropy for logistic loss, half squared error otherwise
+     */
+    inline float MarginToLoss(float label, float margin) const {
+      if (loss_type == 1) {
+        // nlogprob = log(1 + exp(-margin)); the branch keeps the
+        // argument of exp non-positive
+        float nlogprob;
+        if (margin > 0.0f) {
+          nlogprob = std::log(1.0f + std::exp(-margin));
+        } else {
+          nlogprob = -margin + std::log(1.0f + std::exp(margin));
+        }
+        return label * nlogprob +
+            (1.0f -label) * (margin + nlogprob);
+      } else {
+        float diff = margin - label;
+        return 0.5f * diff * diff;
+      }
+    }
+    /*! \brief gradient of the loss with respect to the margin */
+    inline float PredToGrad(float label, float pred) const {
+      return pred - label;
+    }
+    /*!
+     * \brief compute the margin of a sparse instance
+     * \param weight weight array of num_feature + 1 entries
+     * \param v the sparse feature vector
+     * \return base_score + bias + dot product over the known features
+     */
+    inline float PredictMargin(const float *weight,
+                               const SparseMat::Vector &v) const {
+      // weight[num_feature] is bias
+      float sum = base_score + weight[num_feature];
+      for (unsigned i = 0; i < v.length; ++i) {
+        // features the model does not know about are ignored
+        if (v[i].findex >= num_feature) continue;
+        sum += weight[v[i].findex] * v[i].fvalue;
+      }
+      return sum;
+    }
+    /*! \brief prediction of a sparse instance under the given weights */
+    inline float Predict(const float *weight,
+                         const SparseMat::Vector &v) const {
+      return MarginToPred(PredictMargin(weight, v));
+    }
+  };
+  // model parameter
+  ModelParam param;
+  // weight corresponding to the model, num_feature + 1 entries once
+  // allocated by Load, NULL otherwise
+  // NOTE(review): the struct owns this raw pointer but has no copy
+  // constructor / assignment, so copying a LinearModel would double free
+  float *weight;
+  LinearModel(void) : weight(NULL) {
+  }
+  ~LinearModel(void) {
+    if (weight != NULL) delete [] weight;
+  }
+  /*!
+   * \brief load parameters and weights from a stream
+   * \param fi the input stream
+   */
+  inline void Load(rabit::IStream &fi) {
+    fi.Read(&param, sizeof(param));
+    // always size the buffer from the parameters just read: a buffer kept
+    // from an earlier load may be too small for the new num_feature
+    delete [] weight;
+    weight = NULL;
+    weight = new float[param.num_feature + 1];
+    fi.Read(weight, sizeof(float) * (param.num_feature + 1));
+  }
+  /*!
+   * \brief save parameters and weights to a stream
+   * \param fo the output stream
+   * \param wptr weights to save, the model's own weights when NULL
+   */
+  inline void Save(rabit::IStream &fo, const float *wptr = NULL) const {
+    fo.Write(&param, sizeof(param));
+    if (wptr == NULL) wptr = weight;
+    fo.Write(wptr, sizeof(float) * (param.num_feature + 1));
+  }
+  /*! \brief prediction of a sparse instance under the model's own weights */
+  inline float Predict(const SparseMat::Vector &v) const {
+    return param.Predict(weight, v);
+  }
+};
+}  // namespace linear
+}  // namespace rabit
+#endif // RABIT_LINEAR_H_
--- a/subtree/rabit/rabit-learn/linear/run-linear-mock.sh
+++ b/subtree/rabit/rabit-learn/linear/run-linear-mock.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Demo: split the training data into one file per worker and run
+# linear.mock on it through the rabit demo tracker.
+# $1 is the number of processes; the remaining arguments are forwarded
+# to the program.
+if [[ $# -lt 1 ]]
+then
+    echo "Usage: nprocess"
+    # exit statuses are limited to 0-255, so use a plain failure code
+    exit 1
+fi
+
+rm -rf mushroom.row* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitrows.py ../data/agaricus.txt.train mushroom "$k"
+
+# run linear.mock on k workers; "${@:2}" forwards every extra argument
+# as its own word instead of gluing them into a single one
+../../tracker/rabit_demo.py -n "$k" linear.mock mushroom.row\%d "${@:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0  mock=0,2,1,1
--- a/subtree/rabit/rabit-learn/linear/run-linear.sh
+++ b/subtree/rabit/rabit-learn/linear/run-linear.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Demo: split the training data into one file per worker, train with
+# linear.rabit through the rabit demo tracker, then predict on the test set.
+# $1 is the number of processes; the remaining arguments are forwarded
+# to the training program.
+if [[ $# -lt 1 ]]
+then
+    echo "Usage: nprocess"
+    # exit statuses are limited to 0-255, so use a plain failure code
+    exit 1
+fi
+
+rm -rf mushroom.row* *.model
+k=$1
+
+# split the lib svm file into k subfiles
+python splitrows.py ../data/agaricus.txt.train mushroom "$k"
+
+# train on k workers; "${@:2}" forwards every extra argument as its own
+# word instead of gluing them into a single one
+../../tracker/rabit_demo.py -n "$k" linear.rabit mushroom.row\%d "${@:2}" reg_L1=1
+
+# predict on the test set with the model written by the training run
+./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
--- a/subtree/rabit/rabit-learn/linear/splitrows.py
+++ b/subtree/rabit/rabit-learn/linear/splitrows.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python
+import sys
+import random
+
+# split libsvm file into different rows
+if len(sys.argv) < 4:
+    print ('Usage:<fin> <fo> k')
+    exit(0)
+
+random.seed(10)
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+    fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
+    
+for l in open(sys.argv[1]):
+    i = random.randint(0, k-1)
+    fos[i].write(l)
+
+for f in fos:    
+    f.close()