commit d348f83c17

31  demo/binary_classification/mushroom.hadoop.conf  Normal file
@@ -0,0 +1,31 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight (hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3

# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final-round model
save_period = 0
# the path of the training data
data = "agaricus.txt.train"

# The following parameters are not yet supported by xgboost running on hadoop!
# the path of the validation data, used to monitor training progress; here [test] sets the name of the validation set
#eval[test] = "agaricus.txt.test"
# evaluate on the training data as well each round
#eval_train = 1
# the path of the test data
#test:data = "agaricus.txt.test"
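Before wiring this configuration into hadoop, it can be sanity-checked with a plain local run; a minimal sketch, assuming the xgboost binary is built at the repository root and agaricus.txt.train has already been generated in demo/binary_classification:

```bash
# from inside demo/binary_classification: train locally with the same conf
# (2 boosting rounds, gbtree booster, binary:logistic objective)
cd demo/binary_classification
../../xgboost mushroom.hadoop.conf
```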
@@ -22,6 +22,8 @@ Design Choice
- Row-based solver splits data by row; each node works on a subset of rows.
  It uses an approximate histogram count algorithm, and will only examine a subset of
  potential split points, as opposed to all split points.
- Hadoop version can run on an existing hadoop platform;
  it uses Rabit to submit jobs as map-reduce tasks.

Usage
====
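The map-reduce submission mentioned in the new bullet is handled by Rabit's tracker script; a sketch of its invocation shape, with flag meanings inferred from the run_binary_classification.sh script added below (worker count and HDFS paths are placeholders):

```bash
# -n: number of workers, -i/-o: HDFS input/output, -f: file shipped with the job
python rabit/tracker/rabit_hadoop.py -n 2 \
    -i <hdfs_input_path> -o <hdfs_output_path> \
    -f demo/binary_classification/mushroom.hadoop.conf \
    --jobname xgboost_hadoop \
    xgboost mushroom.hadoop.conf data=stdin model_out=stdout
```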
15  multi-node/hadoop/README.md  Normal file
@@ -0,0 +1,15 @@
Distributed XGBoost: Hadoop Version
====
* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
  - This is the hadoop version of the binary classification example in the demo folder.

How to Use
====
* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set the hadoop-streaming.jar path in rabit_hadoop.py.

Notes
====
* The code has been tested on MapReduce 1 (MRv1); it should also be able to run on MapReduce 2 (MRv2, YARN).
* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers.
* The hadoop version can currently only save the final model and evaluate test data locally after the training process.
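For the $HADOOP_HOME check in "How to Use", a minimal sketch; the install path below is only an example and varies by cluster, and the HDFS directory is a placeholder:

```bash
# confirm hadoop is visible to the tracker; empty output means it is unset
echo $HADOOP_HOME
# if unset, export it (example path), or set the hadoop-streaming.jar path in rabit_hadoop.py
export HADOOP_HOME=/usr/lib/hadoop
# then launch the example, e.g. with 2 workers and a working directory on HDFS
bash run_binary_classification.sh 2 /user/<your_name>/xgboost-demo
```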
43  multi-node/hadoop/run_binary_classification.sh  Executable file
@@ -0,0 +1,43 @@
#!/bin/bash

if [ "$#" -lt 2 ];
then
    echo "Usage: <nworkers> <path_in_HDFS>"
    exit -1
fi

curDir=`pwd`
dataDir=../../demo/binary_classification
trainFile=$dataDir/agaricus.txt.train
input=$2
output=$2/model

# generate the training file if it does not exist
if [ ! -f "$trainFile" ];
then
    echo "Generating training file:"
    cd $dataDir
    # map features using indicator encoding, also produce featmap.txt
    python mapfeat.py
    # split into train and test
    python mknfold.py agaricus.txt 1
    cd $curDir
fi

hadoop fs -mkdir $input
hadoop fs -put $trainFile $input
#hadoop fs -rm -skipTrash -r $output

# train and output the final model file
python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout

# get the final model file
hadoop fs -get $output/part-00000 ./final.model
# output predictions with task=pred
../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
# dump the boosters of final.model into dump.raw.txt
../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
# use the feature map when dumping for better readability
../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt
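The data=stdin and model_out=stdout overrides are what let xgboost act as a hadoop-streaming mapper: each task reads its input split on stdin, and hadoop collects the model written to stdout as part-00000. Conceptually, a single task runs something like the following local pipe (an illustration only, not how hadoop actually invokes the binary):

```bash
# feed the training shard on stdin, capture the binary model from stdout
cat agaricus.txt.train | ../../xgboost mushroom.hadoop.conf \
    data=stdin model_out=stdout > final.model
```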