Merge commit '57b5d7873f4f0953357e9d98e9c60cff8373d7ec'

This commit is contained in:
tqchen
2015-03-09 13:28:38 -07:00
43 changed files with 1797 additions and 235 deletions

View File

@@ -0,0 +1,2 @@
mushroom.row*
*.model

View File

@@ -6,7 +6,8 @@ MPIBIN =
OBJ = linear.o
# common build script for programs
include ../common.mk
include ../make/config.mk
include ../make/common.mk
CFLAGS+=-fopenmp
linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
# dependenies here

View File

@@ -2,11 +2,24 @@ Linear and Logistic Regression
====
* input format: LibSVM
* Local Example: [run-linear.sh](run-linear.sh)
* Runnig on Hadoop: [run-hadoop.sh](run-hadoop.sh)
- Set input data to stdin, and model_out=stdout
* Runnig on YARN: [run-yarn.sh](run-yarn.sh)
- You will need to have YARN
- Modify ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
- Run build.sh on [../../yarn](../../yarn) on to build yarn jar file
Multi-Threading Optimization
====
* The code can be multi-threaded, we encourage you to use it
- Simply add ```nthread=k``` where k is the number of threads you want to use
* If you submit with YARN
- Use ```--vcores``` and ```-mem``` to request CPU and memory resources
- Some scheduler in YARN do not honor CPU request, you can request more memory to grab working slots
* Usually multi-threading improves speed in general
- You can use less workers and assign more resources to each of worker
- This usually means less communication overhead and faster running time
Parameters
===
====
All the parameters can be set by param=value
#### Important Parameters

View File

@@ -1,6 +1,5 @@
#include "./linear.h"
#include "../utils/io.h"
#include "../utils/base64.h"
#include "../io/io.h"
namespace rabit {
namespace linear {
@@ -55,7 +54,9 @@ class LinearObjFunction : public solver::IObjFunction<float> {
}
if (task == "train") {
lbfgs.Run();
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
if (rabit::GetRank() == 0) {
this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
}
} else if (task == "pred") {
this->TaskPred();
} else {
@@ -74,51 +75,37 @@ class LinearObjFunction : public solver::IObjFunction<float> {
printf("Finishing writing to %s\n", name_pred.c_str());
}
inline void LoadModel(const char *fname) {
FILE *fp = utils::FopenCheck(fname, "rb");
IStream *fi = io::CreateStream(fname, "r");
std::string header; header.resize(4);
// check header for different binary encode
// can be base64 or binary
utils::FileStream fi(fp);
utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
// base64 format
utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
// base64 format
if (header == "bs64") {
utils::Base64InStream bsin(fp);
io::Base64InStream bsin(fi);
bsin.InitPosition();
model.Load(bsin);
fclose(fp);
return;
} else if (header == "binf") {
model.Load(fi);
fclose(fp);
return;
model.Load(*fi);
} else {
utils::Error("invalid model file");
}
delete fi;
}
inline void SaveModel(const char *fname,
const float *wptr,
bool save_base64 = false) {
FILE *fp;
bool use_stdout = false;
if (!strcmp(fname, "stdout")) {
fp = stdout;
use_stdout = true;
} else {
fp = utils::FopenCheck(fname, "wb");
}
utils::FileStream fo(fp);
if (save_base64 != 0|| use_stdout) {
fo.Write("bs64\t", 5);
utils::Base64OutStream bout(fp);
IStream *fo = io::CreateStream(fname, "w");
if (save_base64 != 0 || !strcmp(fname, "stdout")) {
fo->Write("bs64\t", 5);
io::Base64OutStream bout(fo);
model.Save(bout, wptr);
bout.Finish('\n');
} else {
fo.Write("binf", 4);
model.Save(fo, wptr);
}
if (!use_stdout) {
fclose(fp);
fo->Write("binf", 4);
model.Save(*fo, wptr);
}
delete fo;
}
inline void LoadData(const char *fname) {
dtrain.Load(fname);

View File

@@ -12,7 +12,7 @@ hadoop fs -mkdir $2/data
hadoop fs -put ../data/agaricus.txt.train $2/data
# submit to hadoop
../../tracker/rabit_hadoop.py --host_ip ip -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model

View File

@@ -5,11 +5,7 @@ then
exit -1
fi
rm -rf mushroom.row* *.model
rm -rf *.model
k=$1
# split the lib svm file into k subfiles
python splitrows.py ../data/agaricus.txt.train mushroom $k
# run xgboost mpi
../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1

View File

@@ -5,13 +5,10 @@ then
exit -1
fi
rm -rf mushroom.row* *.model
rm -rf *.model
k=$1
# split the lib svm file into k subfiles
python splitrows.py ../data/agaricus.txt.train mushroom $k
# run xgboost mpi
../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
# run linear model, the program will automatically split the inputs
../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1
./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model

View File

@@ -0,0 +1,19 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
exit -1
fi
# put the local training file to HDFS
hadoop fs -rm -r -f $2/data
hadoop fs -rm -r -f $2/mushroom.linear.model
hadoop fs -mkdir $2/data
# submit to hadoop
../../tracker/rabit_yarn.py -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
# get the final model file
hadoop fs -get $2/mushroom.linear.model ./linear.model
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model

View File

@@ -1,24 +0,0 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different rows
if len(sys.argv) < 4:
print ('Usage:<fin> <fo> k')
exit(0)
random.seed(10)
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
for l in open(sys.argv[1]):
i = random.randint(0, k-1)
fos[i].write(l)
for f in fos:
f.close()