Merge commit '57b5d7873f4f0953357e9d98e9c60cff8373d7ec'
subtree/rabit/rabit-learn/linear/.gitignore (new file)
@@ -0,0 +1,2 @@
+mushroom.row*
+*.model
@@ -6,7 +6,8 @@ MPIBIN =
 OBJ = linear.o
 
 # common build script for programs
-include ../common.mk
+include ../make/config.mk
+include ../make/common.mk
 CFLAGS+=-fopenmp
 linear.o: linear.cc ../../src/*.h linear.h ../solver/*.h
 # dependencies here
@@ -2,11 +2,24 @@ Linear and Logistic Regression
 ====
 * input format: LibSVM
 * Local Example: [run-linear.sh](run-linear.sh)
 * Running on Hadoop: [run-hadoop.sh](run-hadoop.sh)
   - Set the input data to stdin and model_out=stdout
 
+* Running on YARN: [run-yarn.sh](run-yarn.sh)
+  - You will need to have YARN
+  - Modify ```../make/config.mk``` to set USE_HDFS=1 to compile with HDFS support
+  - Run build.sh in [../../yarn](../../yarn) to build the YARN jar file
+
+Multi-Threading Optimization
+====
+* The code can be multi-threaded; we encourage you to use it
+  - Simply add ```nthread=k```, where k is the number of threads you want to use
+* If you submit with YARN
+  - Use ```--vcores``` and ```-mem``` to request CPU and memory resources
+  - Some YARN schedulers do not honor CPU requests; you can request more memory to grab working slots
+* Multi-threading usually improves speed
+  - You can use fewer workers and assign more resources to each worker
+  - This usually means less communication overhead and faster running time
+
 Parameters
-===
+====
 All the parameters can be set as param=value on the command line; see the usage sketch below
 
 #### Important Parameters
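The usage sketch below pulls together the param=value style and the threading flags described in the README changes above. It is only an illustration, not part of the commit: it assumes linear.rabit has been built in this directory, runs a single local worker through rabit_demo.py, and uses placeholder HDFS paths; the exact argument format of -mem is not shown in this diff, so it is only mentioned in a comment.

```bash
#!/bin/bash
# Local run: everything after the data file is a param=value pair,
# including the nthread option described above.
../../tracker/rabit_demo.py -n 1 linear.rabit ../data/agaricus.txt.train \
    reg_L1=1 nthread=4 model_out=linear.model

# Prediction reuses the same binary with task=pred and model_in.
./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model

# On YARN, also request CPU per worker with --vcores (and memory with -mem,
# as noted in the README); the hdfs:// paths here are placeholders.
../../tracker/rabit_yarn.py -n 4 --vcores 4 linear.rabit \
    hdfs:///tmp/rabit/data/agaricus.txt.train nthread=4 \
    model_out=hdfs:///tmp/rabit/mushroom.linear.model
```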
@@ -1,6 +1,5 @@
 #include "./linear.h"
-#include "../utils/io.h"
-#include "../utils/base64.h"
+#include "../io/io.h"
 
 namespace rabit {
 namespace linear {
@@ -55,7 +54,9 @@ class LinearObjFunction : public solver::IObjFunction<float> {
     }
     if (task == "train") {
       lbfgs.Run();
-      this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+      if (rabit::GetRank() == 0) {
+        this->SaveModel(model_out.c_str(), lbfgs.GetWeight());
+      }
     } else if (task == "pred") {
       this->TaskPred();
     } else {
@@ -74,51 +75,37 @@ class LinearObjFunction : public solver::IObjFunction<float> {
     printf("Finishing writing to %s\n", name_pred.c_str());
   }
   inline void LoadModel(const char *fname) {
-    FILE *fp = utils::FopenCheck(fname, "rb");
+    IStream *fi = io::CreateStream(fname, "r");
     std::string header; header.resize(4);
     // check header for different binary encode
     // can be base64 or binary
-    utils::FileStream fi(fp);
-    utils::Check(fi.Read(&header[0], 4) != 0, "invalid model");
-    // base64 format
+    utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
+    // base64 format
     if (header == "bs64") {
-      utils::Base64InStream bsin(fp);
+      io::Base64InStream bsin(fi);
       bsin.InitPosition();
       model.Load(bsin);
-      fclose(fp);
-      return;
     } else if (header == "binf") {
-      model.Load(fi);
-      fclose(fp);
-      return;
+      model.Load(*fi);
     } else {
       utils::Error("invalid model file");
     }
+    delete fi;
   }
   inline void SaveModel(const char *fname,
                         const float *wptr,
                         bool save_base64 = false) {
-    FILE *fp;
-    bool use_stdout = false;
-    if (!strcmp(fname, "stdout")) {
-      fp = stdout;
-      use_stdout = true;
-    } else {
-      fp = utils::FopenCheck(fname, "wb");
-    }
-    utils::FileStream fo(fp);
-    if (save_base64 != 0 || use_stdout) {
-      fo.Write("bs64\t", 5);
-      utils::Base64OutStream bout(fp);
+    IStream *fo = io::CreateStream(fname, "w");
+    if (save_base64 != 0 || !strcmp(fname, "stdout")) {
+      fo->Write("bs64\t", 5);
+      io::Base64OutStream bout(fo);
       model.Save(bout, wptr);
       bout.Finish('\n');
     } else {
-      fo.Write("binf", 4);
-      model.Save(fo, wptr);
-    }
-    if (!use_stdout) {
-      fclose(fp);
+      fo->Write("binf", 4);
+      model.Save(*fo, wptr);
     }
+    delete fo;
   }
   inline void LoadData(const char *fname) {
     dtrain.Load(fname);
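As a quick sanity check on the SaveModel/LoadModel change above: the first four bytes of a saved model identify its encoding. The sketch below assumes the model was written to a local file named linear.model; the file name itself is arbitrary.

```bash
# "bs64" means base64 (always chosen when model_out=stdout),
# "binf" means the raw binary format.
head -c 4 linear.model && echo
```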
@@ -12,7 +12,7 @@ hadoop fs -mkdir $2/data
 hadoop fs -put ../data/agaricus.txt.train $2/data
 
 # submit to hadoop
-../../tracker/rabit_hadoop.py --host_ip ip -n $1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
+../../tracker/rabit_hadoop_streaming.py -n $1 --vcores 1 -i $2/data/agaricus.txt.train -o $2/mushroom.linear.model linear.rabit stdin model_out=stdout "${*:3}"
 
 # get the final model file
 hadoop fs -get $2/mushroom.linear.model/part-00000 ./linear.model
@@ -5,11 +5,7 @@ then
     exit -1
 fi
 
-rm -rf mushroom.row* *.model
+rm -rf *.model
 k=$1
 
-# split the lib svm file into k subfiles
-python splitrows.py ../data/agaricus.txt.train mushroom $k
-
-# run xgboost mpi
-../../tracker/rabit_demo.py -n $k linear.mock mushroom.row\%d "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
+../../tracker/rabit_demo.py -n $k linear.mock ../data/agaricus.txt.train "${*:2}" reg_L1=1 mock=0,1,1,0 mock=1,1,1,0 mock=0,2,1,1
@@ -5,13 +5,10 @@ then
     exit -1
 fi
 
-rm -rf mushroom.row* *.model
+rm -rf *.model
 k=$1
 
-# split the lib svm file into k subfiles
-python splitrows.py ../data/agaricus.txt.train mushroom $k
-
-# run xgboost mpi
-../../tracker/rabit_demo.py -n $k linear.rabit mushroom.row\%d "${*:2}" reg_L1=1
+# run the linear model; the program will automatically split the input
+../../tracker/rabit_demo.py -n $k linear.rabit ../data/agaricus.txt.train reg_L1=1
 
 ./linear.rabit ../data/agaricus.txt.test task=pred model_in=final.model
subtree/rabit/rabit-learn/linear/run-yarn.sh (new executable file)
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+    echo "Usage: <nworkers> <path_in_HDFS> [param=val]"
+    exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -rm -r -f $2/data
+hadoop fs -rm -r -f $2/mushroom.linear.model
+hadoop fs -mkdir $2/data
+
+# submit to hadoop
+../../tracker/rabit_yarn.py -n $1 --vcores 1 linear.rabit hdfs://$2/data/agaricus.txt.train model_out=hdfs://$2/mushroom.linear.model "${*:3}"
+
+# get the final model file
+hadoop fs -get $2/mushroom.linear.model ./linear.model
+
+./linear.rabit ../data/agaricus.txt.test task=pred model_in=linear.model
@@ -1,24 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different rows
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
-
-for l in open(sys.argv[1]):
-    i = random.randint(0, k-1)
-    fos[i].write(l)
-
-for f in fos:
-    f.close()