Merge pull request #137 from cblsjtu/unity

Unity hadoop version scripts
2015-01-10 23:47:52 -08:00
parent 69e079941e ceabf5755f
commit c38f7109bd
3 changed files with 34 additions and 49 deletions
--- a/demo/binary_classification/mushroom.hadoop.conf
+++ b/demo/binary_classification/mushroom.hadoop.conf
@@ -20,12 +20,11 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0 
 # The path of training data
-data = "agaricus.txt.train" 
+data = stdin
+# The path of the output model file
+model_out = stdout 

-# The following parameters are not supported by xgboost running in hadoop yet!
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-#eval[test] = "agaricus.txt.test" 
+eval[test] = "agaricus.txt.test" 
 # evaluate on training data as well each round
-#eval_train = 1
-# The path of test data 
-#test:data = "agaricus.txt.test"      
+eval_train = 1
--- a/multi-node/hadoop/run_binary_classification.sh
+++ b/multi-node/hadoop/run_binary_classification.sh
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <nworkers> <path_in_HDFS>"
-    exit -1
-fi
-
-curDir=`pwd`
-dataDir=../../demo/binary_classification
-trainFile=$dataDir/agaricus.txt.train
-input=$2
-output=$2/model
-
-# generate the training file if it doesnot exist
-if [ ! -f "$trainFile" ];
-then 
-  echo "Generating training file:"
-  cd $dataDir
-  # map feature using indicator encoding, also produce featmap.txt
-  python mapfeat.py
-  # split train and test
-  python mknfold.py agaricus.txt 1
-  cd $curDir
-fi
-
-hadoop fs -mkdir $input
-hadoop fs -put $trainFile $input
-#hadoop fs -rm -skipTrash -r $output
-
-# training and output the final model file
-python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
-    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout
-
-# get the final model file
-hadoop fs -get $output/part-00000 ./final.model
-# output prediction task=pred 
-../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
-# print the boosters of 00002.model in dump.raw.txt
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
--- a/multi-node/hadoop/run_hadoop_mushroom.sh
+++ b/multi-node/hadoop/run_hadoop_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Train the mushroom demo with xgboost on Hadoop, then predict and dump locally.
+# Run from this directory (paths are relative): <num_of_slave_nodes> <path_in_HDFS>
+if [ "$#" -lt 2 ]; then
+    echo "Usage: <num_of_slave_nodes> <path_in_HDFS>" >&2
+    exit 1
+fi
+
+# put the local training file to HDFS (arguments quoted to prevent word splitting)
+hadoop fs -mkdir "$2/data"
+hadoop fs -put ../../demo/data/agaricus.txt.train "$2/data"
+
+# training and output the final model file
+../../rabit/tracker/rabit_hadoop.py -n "$1" -i "$2/data/agaricus.txt.train" \
+    -o "$2/model" -f ../../demo/data/agaricus.txt.test \
+    ../../xgboost mushroom.hadoop.conf dsplit=row
+
+# get the final model file
+hadoop fs -get "$2/model/part-00000" ./final.model
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt