diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index adfacdb8b..a3411fee4 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -1,6 +1,6 @@
 Distributed XGBoost: Hadoop Version
 ====
-* Hadoop version: run ```bash run_binary_classification.sh ```
+* Hadoop version: run ```bash run_binary_classification.sh ```
 - This is the hadoop version of binary classification example in the demo folder.
 
 How to Use
@@ -9,7 +9,7 @@ How to Use
 
 Notes
 ====
-* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
-* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter should be less than the number of slaves/workers.
-* The hadoop version now can only save the final model and evaluate test data locally after the training process.
-
+* The code has been tested on MapReduce 1 (MRv1) and YARN; running it on MapReduce 2 (MRv2, YARN) is recommended.
+* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set nthread to the number of cores you have on each machine.
+ - You will need YARN to specify the number of cores for each worker.
+* The hadoop version saves the final model into HDFS.
diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
deleted file mode 100755
index 2f095ff25..000000000
--- a/multi-node/hadoop/run_hadoop_mushroom.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <nworkers> <path_in_HDFS>"
-    exit -1
-fi
-
-# put the local training file to HDFS
-hadoop fs -mkdir $2/data
-hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
-
-# training and output the final model file
-../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
-    -o $2/model -f ../../demo/data/agaricus.txt.test \
-    ../../xgboost mushroom.hadoop.conf dsplit=row
-
-# get the final model file
-hadoop fs -get $2/model/part-00000 ./final.model
-
-# output prediction task=pred
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
-# print the boosters of final.model in dump.raw.txt
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
diff --git a/multi-node/hadoop/run_yarn_mushroom.sh b/multi-node/hadoop/run_mushroom.sh
old mode 100644
new mode 100755
similarity index 68%
rename from multi-node/hadoop/run_yarn_mushroom.sh
rename to multi-node/hadoop/run_mushroom.sh
index 07ac291d1..1e647047f
--- a/multi-node/hadoop/run_yarn_mushroom.sh
+++ b/multi-node/hadoop/run_mushroom.sh
@@ -9,21 +9,15 @@ fi
 hadoop fs -mkdir $3/data
 hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
 
-
-python ../../rabit/tracker/rabit_yarn.py -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
-    -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2 dsplit=row
-
-
+../../rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2
 # get the final model file
 hadoop fs -get $3/model/part-00000 ./final.model
 
 # output prediction task=pred
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
 # print the boosters of final.model in dump.raw.txt
 ../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
 # use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
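
For context, a minimal usage sketch of the renamed `run_mushroom.sh` under the new calling convention. The worker count, thread count, and HDFS path below are illustrative placeholders, not values from the diff; the expanded commands simply restate what the script runs after this change:

```bash
#!/bin/bash
# Hypothetical invocation (arguments are illustrative):
#   $1 = number of workers, $2 = threads per worker, $3 = HDFS working path.
# Per the README note, set the thread count to the cores on each machine.
./run_mushroom.sh 4 8 /tmp/xgboost-mushroom

# Which, per this diff, boils down to staging the training data in HDFS and
# submitting distributed training through the rabit tracker; the final model
# is written back to HDFS under <path>/model.
hadoop fs -mkdir /tmp/xgboost-mushroom/data
hadoop fs -put ../../demo/data/agaricus.txt.train /tmp/xgboost-mushroom/data
../../rabit/tracker/rabit_hadoop.py -n 4 -nt 8 \
    -i /tmp/xgboost-mushroom/data/agaricus.txt.train \
    -o /tmp/xgboost-mushroom/model \
    ../../xgboost mushroom.hadoop.conf nthread=8
hadoop fs -get /tmp/xgboost-mushroom/model/part-00000 ./final.model
```

Note the design change this diff encodes: one multi-threaded xgboost process per worker (`-n` workers, `nthread` cores each) via `rabit_hadoop.py`, replacing the separate `rabit_yarn.py` path, with the trained model persisted to HDFS rather than only evaluated locally.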