Squashed 'subtree/rabit/' changes from d4ec037..28ca7be

28ca7be add linear readme ca4b20f add linear readme 1133628 add linear readme 6a11676 update docs a607047 Update build.sh 2c1cfd8 complete yarn 4f28e32 change formater 2fbda81 fix stdin input 3258bcf checkin yarn master 67ebf81 allow setup from env variables 9b6bf57 fix hdfs 395d5c2 add make system 88ce767 refactor io, initial hdfs file access need test 19be870 chgs a1bd3c6 Merge branch 'master' of ssh://github.com/tqchen/rabit 1a573f9 introduce input split 29476f1 fix timer issue git-subtree-dir: subtree/rabit git-subtree-split: 28ca7becbd
2015-03-09 13:28:38 -07:00
parent ef2de29f06
commit 57b5d7873f
43 changed files with 1797 additions and 235 deletions
--- a/rabit-learn/linear/.gitignore
+++ b/rabit-learn/linear/.gitignore
@@ -0,0 +1,2 @@
+mushroom.row*
+*.model
--- a/rabit-learn/linear/Makefile
+++ b/rabit-learn/linear/Makefile
@@ -6,7 +6,8 @@ MPIBIN =
 OBJ = linear.o

 # common build script for programs
-include ../common.mk
+include ../make/config.mk
+include ../make/common.mk
 CFLAGS+=-fopenmp
 linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
 # dependenies here
--- a/rabit-learn/linear/README.md
+++ b/rabit-learn/linear/README.md
@@ -2,11 +2,24 @@ Linear and Logistic Regression
 ====
 * input format: LibSVM
 * Local Example: [run-linear.sh](run-linear.sh)
-* Runnig on Hadoop: [run-hadoop.sh](run-hadoop.sh)
-  - Set input data to stdin, and model_out=stdout
-    
+* Running on YARN: [run-yarn.sh](run-yarn.sh)
+  - You will need to have YARN 
+  - Modify  ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
+  - Run build.sh in [../../yarn](../../yarn) to build the YARN jar file
+
+Multi-Threading Optimization
+====
+* The code supports multi-threading, and we encourage you to use it
+  - Simply add ```nthread=k``` where k is the number of threads you want to use
+* If you submit with YARN 
+  - Use ```--vcores``` and ```-mem``` to request CPU and memory resources
+  - Some schedulers in YARN do not honor CPU requests; in that case you can request more memory to grab working slots
+* Multi-threading usually improves speed
+  - You can use fewer workers and assign more resources to each worker
+  - This usually means less communication overhead and faster running time
+
 Parameters
-===
+====
 All the parameters can be set by param=value

 #### Important Parameters
--- a/rabit-learn/linear/linear.cc
+++ b/rabit-learn/linear/linear.cc
@@ -1,6 +1,5 @@
 #include "./linear.h"
-#include "../utils/io.h"
-#include "../utils/base64.h"
+#include "../io/io.h"

 namespace rabit {
 namespace linear {
@@ -55,7 +54,9 @@ class LinearObjFunction : public solver::IObjFunction<float> {
    }
    if (task == "train") {
      lbfgs.Run();
-      this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+      if (rabit::GetRank() == 0) {
+        this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+      }
    } else if (task == "pred") {
      this->TaskPred();
    } else {
@@ -74,51 +75,37 @@ class LinearObjFunction : public solver::IObjFunction<float> {
    printf("Finishing writing to %s\n", name_pred.c_str());
  }
  inline void LoadModel(const char *fname) {
-    FILE *fp = utils::FopenCheck(fname, "rb");
+    IStream *fi = io::CreateStream(fname, "r");
    std::string header; header.resize(4);
    // check header for different binary encode
    // can be base64 or binary
-    utils::FileStream fi(fp);
-    utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
-      // base64 format
+    utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
+    // base64 format
    if (header == "bs64") {
-      utils::Base64InStream bsin(fp);
+      io::Base64InStream bsin(fi);
      bsin.InitPosition();
      model.Load(bsin);
-      fclose(fp);
-      return;
    } else if (header == "binf") {
-      model.Load(fi);
-      fclose(fp);
-      return;     
+      model.Load(*fi);
    } else {
      utils::Error("invalid model file");
    }
+    delete fi;
  }
  inline void SaveModel(const char *fname,
                        const float *wptr,
                        bool save_base64 = false) {
-    FILE *fp;
-    bool use_stdout = false;
-    if (!strcmp(fname, "stdout")) {
-      fp = stdout;
-      use_stdout = true;
-    } else {
-      fp = utils::FopenCheck(fname, "wb");
-   }
-    utils::FileStream fo(fp);
-    if (save_base64 != 0|| use_stdout) {
-      fo.Write("bs64\t", 5);
-      utils::Base64OutStream bout(fp);
+    IStream *fo = io::CreateStream(fname, "w");
+    if (save_base64 != 0 || !strcmp(fname, "stdout")) {
+      fo->Write("bs64\t", 5);
+      io::Base64OutStream bout(fo);
      model.Save(bout, wptr);
      bout.Finish('\n');
    } else {
-      fo.Write("binf", 4);
-      model.Save(fo, wptr);
-    }
-    if (!use_stdout) {
-      fclose(fp);
+      fo->Write("binf", 4);
+      model.Save(*fo, wptr);
    }
+    delete fo;
  }
  inline void LoadData(const char *fname) {
    dtrain.Load(fname);
--- a/rabit-learn/linear/run-hadoop-old.sh
+++ b/rabit-learn/linear/run-hadoop-old.sh
@@ -12,7 +12,7 @@ hadoop fs -mkdir $2/data
 hadoop fs -put ../data/agaricus.txt.train $2/data

 # submit to hadoop
-../../tracker/rabit_hadoop.py --host_ip ip  -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}" 
+../../tracker/rabit_hadoop_streaming.py  -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}" 

 # get the final model file
 hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
--- a/rabit-learn/linear/run-linear-mock.sh
+++ b/rabit-learn/linear/run-linear-mock.sh
@@ -5,11 +5,7 @@ then
    exit -1
 fi

-rm -rf mushroom.row* *.model
+rm -rf *.model
 k=$1

-# split the lib svm file into k subfiles
-python splitrows.py ../data/agaricus.txt.train mushroom $k
-
-# run xgboost mpi
-../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0  mock=0,2,1,1
+../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train  "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0  mock=0,2,1,1
--- a/rabit-learn/linear/run-linear.sh
+++ b/rabit-learn/linear/run-linear.sh
@@ -5,13 +5,10 @@ then
    exit -1
 fi

-rm -rf mushroom.row* *.model
+rm -rf *.model
 k=$1

-# split the lib svm file into k subfiles
-python splitrows.py ../data/agaricus.txt.train mushroom $k
-
-# run xgboost mpi
-../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
+# run linear model, the program will automatically split the inputs
+../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1 

 ./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
--- a/rabit-learn/linear/run-yarn.sh
+++ b/rabit-learn/linear/run-yarn.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+	echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
+	exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -rm -r -f $2/data
+hadoop fs -rm -r -f $2/mushroom.linear.model
+hadoop fs -mkdir $2/data
+
+# submit to hadoop
+../../tracker/rabit_yarn.py  -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}" 
+
+# get the final model file
+hadoop fs -get $2/mushroom.linear.model ./linear.model
+
+./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
--- a/rabit-learn/linear/splitrows.py
+++ b/rabit-learn/linear/splitrows.py
@@ -1,24 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different rows
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
-    
-for l in open(sys.argv[1]):
-    i = random.randint(0, k-1)
-    fos[i].write(l)
-
-for f in fos:    
-    f.close()