chg of hadoop script

This commit is contained in:
tqchen 2015-01-11 21:02:38 -08:00
parent 15bf8677da
commit 62a108a7c2
3 changed files with 8 additions and 43 deletions

View File

@ -1,6 +1,6 @@
Distributed XGBoost: Hadoop Version
====
* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
- This is the hadoop version of binary classification example in the demo folder.
How to Use
@ -9,7 +9,7 @@ How to Use
Notes
====
* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers.
* The hadoop version now can only save the final model and evaluate test data locally after the training process.
* The code has been tested on MapReduce 1 (MRv1) and YARN; it is recommended to run it on MapReduce 2 (MRv2, YARN).
* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set <n_thread_per_worker> to be the number of cores you have on each machine.
- You will need YARN to specify the number of cores for each worker
* The hadoop version saves the final model into HDFS

View File

@ -1,29 +0,0 @@
#!/bin/bash
# Run the distributed XGBoost binary-classification demo on Hadoop.
#
# Arguments:
#   $1 - number of slave nodes (hadoop workers) to launch
#   $2 - working directory in HDFS for input data and the output model
if [ "$#" -lt 2 ]; then
  # Diagnostics go to stderr; use a valid exit status (exit -1 yields 255).
  echo "Usage: <num_of_slave_nodes> <path_in_HDFS>" >&2
  exit 1
fi
# Stage the local training file into HDFS so the workers can read it.
hadoop fs -mkdir "$2/data"
hadoop fs -put ../../demo/data/agaricus.txt.train "$2/data"
# Train with row-wise data split; the tracker writes the final model
# under $2/model in HDFS.
../../rabit/tracker/rabit_hadoop.py -n "$1" -i "$2/data/agaricus.txt.train" \
    -o "$2/model" -f ../../demo/data/agaricus.txt.test \
    ../../xgboost mushroom.hadoop.conf dsplit=row
# Fetch the final model from HDFS to the local working directory.
hadoop fs -get "$2/model/part-00000" ./final.model
# Evaluate locally: output predictions for the test set (task=pred).
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
    test:data=../../demo/data/agaricus.txt.test
# Dump the boosters of final.model in raw text form.
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
# Dump again with the feature map for a human-readable tree listing.
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt

View File

@ -9,21 +9,15 @@ fi
hadoop fs -mkdir $3/data
hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
python ../../rabit/tracker/rabit_yarn.py -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
-i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2 dsplit=row
../../rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2
# get the final model file
hadoop fs -get $3/model/part-00000 ./final.model
# output prediction task=pred
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
test:data=../../demo/data/agaricus.txt.test
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
# print the boosters of final.model in dump.raw.txt
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
# use the feature map in printing for better visualization
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt