commit d348f83c17

31  demo/binary_classification/mushroom.hadoop.conf  Normal file
@@ -0,0 +1,31 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight (hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3

# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final-round model
save_period = 0
# the path of the training data
data = "agaricus.txt.train"

# The following parameters are not yet supported by xgboost running on hadoop!
# the path of the validation data, used to monitor training progress; here [test] sets the name of the validation set
#eval[test] = "agaricus.txt.test"
# evaluate on the training data as well each round
#eval_train = 1
# the path of the test data
#test:data = "agaricus.txt.test"
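Before wiring this configuration into hadoop, it can be sanity-checked with a plain local run; a minimal sketch, assuming the xgboost binary is built at the repository root and agaricus.txt.train has already been generated in demo/binary_classification:

```bash
# from inside demo/binary_classification: train locally with the same conf
# (2 boosting rounds, gbtree booster, binary:logistic objective)
cd demo/binary_classification
../../xgboost mushroom.hadoop.conf
```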
@@ -22,6 +22,8 @@ Design Choice
- Row-based solver splits data by row; each node works on a subset of rows.
  It uses an approximate histogram count algorithm, and will only examine a subset of
  potential split points, as opposed to all split points.
- Hadoop version can run on an existing hadoop platform;
  it uses Rabit to submit jobs as map-reduce tasks.

Usage
====
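The map-reduce submission mentioned in the new bullet is handled by Rabit's tracker script; a sketch of its invocation shape, with flag meanings inferred from the run_binary_classification.sh script added below (worker count and HDFS paths are placeholders):

```bash
# -n: number of workers, -i/-o: HDFS input/output, -f: file shipped with the job
python rabit/tracker/rabit_hadoop.py -n 2 \
    -i <hdfs_input_path> -o <hdfs_output_path> \
    -f demo/binary_classification/mushroom.hadoop.conf \
    --jobname xgboost_hadoop \
    xgboost mushroom.hadoop.conf data=stdin model_out=stdout
```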
15  multi-node/hadoop/README.md  Normal file
@@ -0,0 +1,15 @@
Distributed XGBoost: Hadoop Version
====
* Hadoop version: run ```bash run_binary_classification.sh <n_hadoop_workers> <path_in_HDFS>```
  - This is the hadoop version of the binary classification example in the demo folder.

How to Use
====
* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set the hadoop-streaming.jar path in rabit_hadoop.py.

Notes
====
* The code has been tested on MapReduce 1 (MRv1); it should also be able to run on MapReduce 2 (MRv2, YARN).
* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter <n_workers> should be less than the number of slaves/workers.
* The hadoop version can currently only save the final model and evaluate test data locally after the training process.
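For the $HADOOP_HOME check in "How to Use", a minimal sketch; the install path below is only an example and varies by cluster, and the HDFS directory is a placeholder:

```bash
# confirm hadoop is visible to the tracker; empty output means it is unset
echo $HADOOP_HOME
# if unset, export it (example path), or set the hadoop-streaming.jar path in rabit_hadoop.py
export HADOOP_HOME=/usr/lib/hadoop
# then launch the example, e.g. with 2 workers and a working directory on HDFS
bash run_binary_classification.sh 2 /user/<your_name>/xgboost-demo
```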
43  multi-node/hadoop/run_binary_classification.sh  Executable file
@@ -0,0 +1,43 @@
#!/bin/bash

if [ "$#" -lt 2 ];
then
    echo "Usage: <nworkers> <path_in_HDFS>"
    exit -1
fi

curDir=`pwd`
dataDir=../../demo/binary_classification
trainFile=$dataDir/agaricus.txt.train
input=$2
output=$2/model

# generate the training file if it does not exist
if [ ! -f "$trainFile" ];
then
    echo "Generating training file:"
    cd $dataDir
    # map features using indicator encoding, also produce featmap.txt
    python mapfeat.py
    # split into train and test
    python mknfold.py agaricus.txt 1
    cd $curDir
fi

hadoop fs -mkdir $input
hadoop fs -put $trainFile $input
#hadoop fs -rm -skipTrash -r $output

# train and output the final model file
python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout

# get the final model file
hadoop fs -get $output/part-00000 ./final.model
# output predictions with task=pred
../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
# dump the boosters of final.model into dump.raw.txt
../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
# use the feature map when dumping for better readability
../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt
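The data=stdin and model_out=stdout overrides are what let xgboost act as a hadoop-streaming mapper: each task reads its input split on stdin, and hadoop collects the model written to stdout as part-00000. Conceptually, a single task runs something like the following local pipe (an illustration only, not how hadoop actually invokes the binary):

```bash
# feed the training shard on stdin, capture the binary model from stdout
cat agaricus.txt.train | ../../xgboost mushroom.hadoop.conf \
    data=stdin model_out=stdout > final.model
```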