Merge commit '57b5d7873f4f0953357e9d98e9c60cff8373d7ec'

2015-03-09 13:28:38 -07:00
parent 66cf88f7b0 57b5d7873f
commit 9f7c6fe271
43 changed files with 1797 additions and 235 deletions
--- a/subtree/rabit/rabit-learn/linear/.gitignore
+++ b/subtree/rabit/rabit-learn/linear/.gitignore
@@ -0,0 +1,2 @@
+mushroom.row*
+*.model
--- a/subtree/rabit/rabit-learn/linear/Makefile
+++ b/subtree/rabit/rabit-learn/linear/Makefile
@@ -6,7 +6,8 @@ MPIBIN =
 OBJ = linear.o

 # common build script for programs
-include ../common.mk
+include ../make/config.mk
+include ../make/common.mk
 CFLAGS+=-fopenmp
 linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
 # dependenies here
--- a/subtree/rabit/rabit-learn/linear/README.md
+++ b/subtree/rabit/rabit-learn/linear/README.md
@@ -2,11 +2,24 @@ Linear and Logistic Regression
 ====
 * input format: LibSVM
 * Local Example: [run-linear.sh](run-linear.sh)
-* Runnig on Hadoop: [run-hadoop.sh](run-hadoop.sh)
-  - Set input data to stdin, and model_out=stdout
-    
+* Runnig on YARN: [run-yarn.sh](run-yarn.sh)
+  - You will need to have YARN 
+  - Modify  ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
+  - Run build.sh on [../../yarn](../../yarn) on to build yarn jar file 
+
+Multi-Threading Optimization
+====
+* The code can be  multi-threaded, we encourage you to use it
+  - Simply add ```nthread=k``` where k is the number of threads you want to use
+* If you submit with YARN 
+  - Use ```--vcores``` and ```-mem``` to request CPU and memory resources
+  - Some scheduler in YARN do not honor CPU request, you can request more memory to grab working slots
+* Usually multi-threading improves speed in general
+  - You can use less workers and assign more resources to each of worker
+  - This usually means less communication overhead and faster running time
+
 Parameters
-===
+====
 All the parameters can be set by param=value

 #### Important Parameters
--- a/subtree/rabit/rabit-learn/linear/linear.cc
+++ b/subtree/rabit/rabit-learn/linear/linear.cc
@@ -1,6 +1,5 @@
 #include "./linear.h"
-#include "../utils/io.h"
-#include "../utils/base64.h"
+#include "../io/io.h"

 namespace rabit {
 namespace linear {
@@ -55,7 +54,9 @@ class LinearObjFunction : public solver::IObjFunction<float> {
    }
    if (task == "train") {
      lbfgs.Run();
-      this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+      if (rabit::GetRank() == 0) {
+        this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+      }
    } else if (task == "pred") {
      this->TaskPred();
    } else {
@@ -74,51 +75,37 @@ class LinearObjFunction : public solver::IObjFunction<float> {
    printf("Finishing writing to %s\n", name_pred.c_str());
  }
  inline void LoadModel(const char *fname) {
-    FILE *fp = utils::FopenCheck(fname, "rb");
+    IStream *fi = io::CreateStream(fname, "r");
    std::string header; header.resize(4);
    // check header for different binary encode
    // can be base64 or binary
-    utils::FileStream fi(fp);
-    utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
-      // base64 format
+    utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
+    // base64 format
    if (header == "bs64") {
-      utils::Base64InStream bsin(fp);
+      io::Base64InStream bsin(fi);
      bsin.InitPosition();
      model.Load(bsin);
-      fclose(fp);
-      return;
    } else if (header == "binf") {
-      model.Load(fi);
-      fclose(fp);
-      return;     
+      model.Load(*fi);
    } else {
      utils::Error("invalid model file");
    }
+    delete fi;
  }
  inline void SaveModel(const char *fname,
                        const float *wptr,
                        bool save_base64 = false) {
-    FILE *fp;
-    bool use_stdout = false;
-    if (!strcmp(fname, "stdout")) {
-      fp = stdout;
-      use_stdout = true;
-    } else {
-      fp = utils::FopenCheck(fname, "wb");
-   }
-    utils::FileStream fo(fp);
-    if (save_base64 != 0|| use_stdout) {
-      fo.Write("bs64\t", 5);
-      utils::Base64OutStream bout(fp);
+    IStream *fo = io::CreateStream(fname, "w");
+    if (save_base64 != 0 || !strcmp(fname, "stdout")) {
+      fo->Write("bs64\t", 5);
+      io::Base64OutStream bout(fo);
      model.Save(bout, wptr);
      bout.Finish('\n');
    } else {
-      fo.Write("binf", 4);
-      model.Save(fo, wptr);
-    }
-    if (!use_stdout) {
-      fclose(fp);
+      fo->Write("binf", 4);
+      model.Save(*fo, wptr);
    }
+    delete fo;
  }
  inline void LoadData(const char *fname) {
    dtrain.Load(fname);
--- a/subtree/rabit/rabit-learn/linear/run-hadoop-old.sh
+++ b/subtree/rabit/rabit-learn/linear/run-hadoop-old.sh
@@ -12,7 +12,7 @@ hadoop fs -mkdir $2/data
 hadoop fs -put ../data/agaricus.txt.train $2/data

 # submit to hadoop
-../../tracker/rabit_hadoop.py --host_ip ip  -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}" 
+../../tracker/rabit_hadoop_streaming.py  -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}" 

 # get the final model file
 hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
--- a/subtree/rabit/rabit-learn/linear/run-linear-mock.sh
+++ b/subtree/rabit/rabit-learn/linear/run-linear-mock.sh
@@ -5,11 +5,7 @@ then
    exit -1
 fi

-rm -rf mushroom.row* *.model
+rm -rf *.model
 k=$1

-# split the lib svm file into k subfiles
-python splitrows.py ../data/agaricus.txt.train mushroom $k
-
-# run xgboost mpi
-../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0  mock=0,2,1,1
+../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train  "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0  mock=0,2,1,1
--- a/subtree/rabit/rabit-learn/linear/run-linear.sh
+++ b/subtree/rabit/rabit-learn/linear/run-linear.sh
@@ -5,13 +5,10 @@ then
    exit -1
 fi

-rm -rf mushroom.row* *.model
+rm -rf *.model
 k=$1

-# split the lib svm file into k subfiles
-python splitrows.py ../data/agaricus.txt.train mushroom $k
-
-# run xgboost mpi
-../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
+# run linear model, the program will automatically split the inputs
+../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1 

 ./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
--- a/subtree/rabit/rabit-learn/linear/run-yarn.sh
+++ b/subtree/rabit/rabit-learn/linear/run-yarn.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+	echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
+	exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -rm -r -f $2/data
+hadoop fs -rm -r -f $2/mushroom.linear.model
+hadoop fs -mkdir $2/data
+
+# submit to hadoop
+../../tracker/rabit_yarn.py  -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}" 
+
+# get the final model file
+hadoop fs -get $2/mushroom.linear.model ./linear.model
+
+./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
--- a/subtree/rabit/rabit-learn/linear/splitrows.py
+++ b/subtree/rabit/rabit-learn/linear/splitrows.py
@@ -1,24 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different rows
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
-    
-for l in open(sys.argv[1]):
-    i = random.randint(0, k-1)
-    fos[i].write(l)
-
-for f in fos:    
-    f.close()