chg of hadoop script

parent 15bf8677da
commit 62a108a7c2
@ -1,6 +1,6 @@
 Distributed XGBoost: Hadoop Version
 ====
-* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
+* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
 - This is the hadoop version of the binary classification example in the demo folder.
 
 How to Use
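A minimal sketch of the new three-argument invocation; the worker count, thread count, and HDFS path below are placeholder values, not taken from this commit:

```bash
# 2 Hadoop workers, 4 threads per worker, HDFS working directory (all example values)
bash run_binary_classification.sh 2 4 /user/alice/xgboost-demo
```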
@ -9,7 +9,7 @@ How to Use
 
 Notes
 ====
-* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
+* The code has been tested on MapReduce 1 (MRv1) and YARN; it is recommended to run it on MapReduce 2 (MRv2, YARN).
-* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers.
+* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set <n_thread_per_worker> to the number of cores you have on each machine.
-* The hadoop version now can only save the final model and evaluate test data locally after the training process.
+  - You will need YARN to specify the number of cores for each worker.
+* The hadoop version saves the final model into HDFS.
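Because the final model is now written to HDFS rather than to the local disk, it has to be copied back before any local prediction or dump step. A short sketch, assuming the <path_in_HDFS>/model layout the demo scripts use and an example path:

```bash
# copy the trained model out of HDFS; the path prefix is an example,
# the part-00000 name matches what the demo scripts fetch
hadoop fs -get /user/alice/xgboost-demo/model/part-00000 ./final.model
```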
@ -1,29 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <num_of_slave_nodes> <path_in_HDFS>"
-    exit -1
-fi
-
-# put the local training file to HDFS
-hadoop fs -mkdir $2/data
-hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
-
-# training and output the final model file
-../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
-    -o $2/model -f ../../demo/data/agaricus.txt.test \
-    ../../xgboost mushroom.hadoop.conf dsplit=row
-
-# get the final model file
-hadoop fs -get $2/model/part-00000 ./final.model
-
-# output prediction task=pred
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
-# print the boosters of final.model in dump.raw.txt
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
12 changed lines
multi-node/hadoop/run_yarn_mushroom.sh → multi-node/hadoop/run_mushroom.sh (Normal file → Executable file)
@ -9,21 +9,15 @@ fi
 hadoop fs -mkdir $3/data
 hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
 
-python ../../rabit/tracker/rabit_yarn.py -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
-    -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2 dsplit=row
+../../rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2
 
 # get the final model file
 hadoop fs -get $3/model/part-00000 ./final.model
 
 # output prediction task=pred
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
 # print the boosters of final.model in dump.raw.txt
 ../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
 # use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
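For orientation, a hypothetical end-to-end run of the renamed script; the argument values are examples, not from the commit. The three positional arguments feed the tracker's -n (workers) and -nt (threads) flags and the $3 prefix used for the HDFS paths:

```bash
# 4 workers, 8 threads per worker, HDFS working directory (all example values)
bash run_mushroom.sh 4 8 /user/alice/xgboost-mushroom
# afterwards final.model, dump.raw.txt and dump.nice.txt are left in the current directory
```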