diff --git a/demo/binary_classification/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
similarity index 78%
rename from demo/binary_classification/mushroom.hadoop.conf
rename to multi-node/hadoop/mushroom.hadoop.conf
index 1dffe4f8d..305b82dd3 100644
--- a/demo/binary_classification/mushroom.hadoop.conf
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -20,12 +20,11 @@
 num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0
 # The path of training data
-data = "agaricus.txt.train"
+data = stdin
+# The path of model file
+model_out = stdout
-# The following parameters are not supported by xgboost running in hadoop yet!
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-#eval[test] = "agaricus.txt.test"
+eval[test] = "agaricus.txt.test"
 # evaluate on training data as well each round
-#eval_train = 1
-# The path of test data
-#test:data = "agaricus.txt.test"
+eval_train = 1
diff --git a/multi-node/hadoop/run_binary_classification.sh b/multi-node/hadoop/run_binary_classification.sh
deleted file mode 100755
index 740a468cf..000000000
--- a/multi-node/hadoop/run_binary_classification.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <nworkers> <path_in_HDFS>"
-    exit -1
-fi
-
-curDir=`pwd`
-dataDir=../../demo/binary_classification
-trainFile=$dataDir/agaricus.txt.train
-input=$2
-output=$2/model
-
-# generate the training file if it doesnot exist
-if [ ! -f "$trainFile" ];
-then
-    echo "Generating training file:"
-    cd $dataDir
-    # map feature using indicator encoding, also produce featmap.txt
-    python mapfeat.py
-    # split train and test
-    python mknfold.py agaricus.txt 1
-    cd $curDir
-fi
-
-hadoop fs -mkdir $input
-hadoop fs -put $trainFile $input
-#hadoop fs -rm -skipTrash -r $output
-
-# training and output the final model file
-python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
-    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout
-
-# get the final model file
-hadoop fs -get $output/part-00000 ./final.model
-# output prediction task=pred
-../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
-# print the boosters of 00002.model in dump.raw.txt
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
new file mode 100755
index 000000000..2f095ff25
--- /dev/null
+++ b/multi-node/hadoop/run_hadoop_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: <nworkers> <path_in_HDFS>"
+    exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir $2/data
+hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
+
+# training and output the final model file
+../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
+    -o $2/model -f ../../demo/data/agaricus.txt.test \
+    ../../xgboost mushroom.hadoop.conf dsplit=row
+
+# get the final model file
+hadoop fs -get $2/model/part-00000 ./final.model
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
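Example invocation of the run_hadoop_mushroom.sh script added above (a minimal sketch, not part of the patch: the worker count and the HDFS path are placeholders, and it assumes the xgboost binary is already built at the repository root and the hadoop client is on PATH):

    cd multi-node/hadoop
    # train with 4 workers, staging the data and the model under /user/alice/xgboost-demo on HDFS
    ./run_hadoop_mushroom.sh 4 /user/alice/xgboost-demo
    # the script then pulls part-00000 back as final.model, runs task=pred, and dumps the trees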
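If the same HDFS path is reused for a second run, the hadoop fs -mkdir step and the job's output directory may already exist and the run can fail; the deleted run_binary_classification.sh kept a commented-out cleanup line for exactly this case. A cleanup sketch, using the same placeholder path as above:

    # remove the previous run's staged data and model output before retrying
    hadoop fs -rm -skipTrash -r /user/alice/xgboost-demo/model
    hadoop fs -rm -skipTrash -r /user/alice/xgboost-demo/data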