From f82732a3629221674a3729e5f6a9a35dd928f01c Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Tue, 6 Jan 2015 17:09:15 +0800
Subject: [PATCH 1/6] add hadoop folder

---
 multi-node/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/multi-node/README.md b/multi-node/README.md
index 31067af5d..02d6fc820 100644
--- a/multi-node/README.md
+++ b/multi-node/README.md
@@ -22,6 +22,8 @@ Design Choice
   - The row-based solver splits data by row; each node works on a subset of rows.
     It uses an approximate histogram count algorithm, and will only examine a
     subset of potential split points, as opposed to all split points.
+  - The Hadoop version runs on an existing Hadoop platform;
+    it uses Rabit to submit jobs as MapReduce tasks.
 
 Usage
 ====

From e20d4f43870b092fcc5ab1e85f8e48fbb54baa95 Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sat, 10 Jan 2015 12:26:43 +0800
Subject: [PATCH 2/6] comment out parameters not supported by the hadoop
 version of xgboost

---
 .../mushroom.hadoop.conf | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 demo/binary_classification/mushroom.hadoop.conf

diff --git a/demo/binary_classification/mushroom.hadoop.conf b/demo/binary_classification/mushroom.hadoop.conf
new file mode 100644
index 000000000..1dffe4f8d
--- /dev/null
+++ b/demo/binary_classification/mushroom.hadoop.conf
@@ -0,0 +1,31 @@
+# General Parameters, see the comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight (hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+
+# Task Parameters
+# the number of rounds of boosting
+num_round = 2
+# 0 means do not save any model except the final-round model
+save_period = 0
+# the path of the training data
+data = "agaricus.txt.train"
+
+# The following parameters are not yet supported by xgboost running on hadoop!
+# path of the validation data used to monitor training; [test] names the validation set
+#eval[test] = "agaricus.txt.test"
+# evaluate on the training data as well each round
+#eval_train = 1
+# the path of the test data
+#test:data = "agaricus.txt.test"
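The conf file above only sets defaults; as the scripts in the following patches show, any `key = value` pair from the file can also be passed to the xgboost binary as a `key=value` argument after the conf file name, with the command-line value taking precedence. A minimal local sketch, using the relative paths of the demo folder:

```bash
# override num_round and the data path from the command line;
# command-line key=value pairs win over the values in the conf file
../../xgboost mushroom.hadoop.conf num_round=3 data="agaricus.txt.train"
```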
From 61a43111a7d3cf533264084152105c7240aa6867 Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sat, 10 Jan 2015 12:30:00 +0800
Subject: [PATCH 3/6] hadoop version of the xgboost binary classification
 script

---
 .../run_binary_classification_on_hadoop.sh | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100755 multi-node/hadoop/run_binary_classification_on_hadoop.sh

diff --git a/multi-node/hadoop/run_binary_classification_on_hadoop.sh b/multi-node/hadoop/run_binary_classification_on_hadoop.sh
new file mode 100755
index 000000000..c194fcea4
--- /dev/null
+++ b/multi-node/hadoop/run_binary_classification_on_hadoop.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: $0 <nworkers> <path_in_HDFS>"
+    exit 1
+fi
+
+curDir=`pwd`
+dataDir=../../demo/binary_classification
+trainFile=$dataDir/agaricus.txt.train
+input=$2
+output=$2/model
+
+# generate the training file if it does not exist
+if [ ! -f "$trainFile" ];
+then
+    echo "Generating training file:"
+    cd $dataDir
+    # map features using indicator encoding; also produce featmap.txt
+    python mapfeat.py
+    # split train and test
+    python mknfold.py agaricus.txt 1
+    cd $curDir
+fi
+
+hadoop fs -mkdir $input
+hadoop fs -put $trainFile $input
+#hadoop fs -rm -skipTrash -r $output
+
+# train and output the models
+python ../../rabit/tracker/rabit_hadoop.py -n 3 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
+    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf dsplit=row num_round=3 data=stdin model_out=stdout
+
+# fetch the final model file
+hadoop fs -get $output/part-00000 ./final.model
+# output predictions using task=pred
+../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=./final.model
+# dump the boosters of final.model into dump.raw.txt
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map when dumping for better readability
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
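For reference, a typical invocation of the script above, assuming a 3-worker job; the HDFS working directory is illustrative:

```bash
# stage the mushroom data under an HDFS path you own and train with 3 workers
bash run_binary_classification_on_hadoop.sh 3 /user/$USER/xgboost-mushroom
```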
From 24f99220cbd7955f51e047e2feff030d5e7423a3 Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sat, 10 Jan 2015 23:59:25 +0800
Subject: [PATCH 4/6] fix bugs

---
 .../hadoop/run_binary_classification.sh | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100755 multi-node/hadoop/run_binary_classification.sh

diff --git a/multi-node/hadoop/run_binary_classification.sh b/multi-node/hadoop/run_binary_classification.sh
new file mode 100755
index 000000000..740a468cf
--- /dev/null
+++ b/multi-node/hadoop/run_binary_classification.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: $0 <nworkers> <path_in_HDFS>"
+    exit 1
+fi
+
+curDir=`pwd`
+dataDir=../../demo/binary_classification
+trainFile=$dataDir/agaricus.txt.train
+input=$2
+output=$2/model
+
+# generate the training file if it does not exist
+if [ ! -f "$trainFile" ];
+then
+    echo "Generating training file:"
+    cd $dataDir
+    # map features using indicator encoding; also produce featmap.txt
+    python mapfeat.py
+    # split train and test
+    python mknfold.py agaricus.txt 1
+    cd $curDir
+fi
+
+hadoop fs -mkdir $input
+hadoop fs -put $trainFile $input
+#hadoop fs -rm -skipTrash -r $output
+
+# train and output the final model file
+python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
+    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout
+
+# fetch the final model file
+hadoop fs -get $output/part-00000 ./final.model
+# output predictions using task=pred
+../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
+# dump the boosters of final.model into dump.raw.txt
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map when dumping for better readability
+../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From 74348c8001e8eab706fae8c91f21ac8a9e022e92 Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sun, 11 Jan 2015 00:00:03 +0800
Subject: [PATCH 5/6] initialize

---
 multi-node/hadoop/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 multi-node/hadoop/README.md

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
new file mode 100644
index 000000000..adfacdb8b
--- /dev/null
+++ b/multi-node/hadoop/README.md
@@ -0,0 +1,15 @@
+Distributed XGBoost: Hadoop Version
+====
+* Hadoop version: run ```bash run_binary_classification.sh <nworkers> <path_in_HDFS>```
+  - This is the Hadoop version of the binary classification example in the demo folder.
+
+How to Use
+====
+* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set the hadoop-streaming.jar path in rabit_hadoop.py.
+
+Notes
+====
+* The code has been tested on MapReduce 1 (MRv1); it should also run on MapReduce 2 (MRv2, YARN).
+* The code is multi-threaded, so you want to run one xgboost process per node/worker, which means <nworkers> should be no larger than the number of slaves/workers.
+* For now, the Hadoop version can only save the final model; test data is evaluated locally after the training process.
+
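A minimal sketch of the environment check the README describes; the hadoop-streaming.jar location below assumes an MRv1 layout and varies across distributions:

```bash
# confirm HADOOP_HOME is set before submitting; otherwise rabit_hadoop.py
# needs the hadoop-streaming.jar path configured manually
if [ -z "$HADOOP_HOME" ]; then
    echo "HADOOP_HOME is not set; set the hadoop-streaming.jar path in rabit_hadoop.py"
else
    ls "$HADOOP_HOME"/contrib/streaming/hadoop-streaming-*.jar
fi
```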
From 7665dd1ed26abf246caaeb820e6ae83a42546ceb Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sun, 11 Jan 2015 00:04:47 +0800
Subject: [PATCH 6/6] rename

---
 .../run_binary_classification_on_hadoop.sh | 43 -------------------
 1 file changed, 43 deletions(-)
 delete mode 100755 multi-node/hadoop/run_binary_classification_on_hadoop.sh

diff --git a/multi-node/hadoop/run_binary_classification_on_hadoop.sh b/multi-node/hadoop/run_binary_classification_on_hadoop.sh
deleted file mode 100755
index c194fcea4..000000000
--- a/multi-node/hadoop/run_binary_classification_on_hadoop.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: $0 <nworkers> <path_in_HDFS>"
-    exit 1
-fi
-
-curDir=`pwd`
-dataDir=../../demo/binary_classification
-trainFile=$dataDir/agaricus.txt.train
-input=$2
-output=$2/model
-
-# generate the training file if it does not exist
-if [ ! -f "$trainFile" ];
-then
-    echo "Generating training file:"
-    cd $dataDir
-    # map features using indicator encoding; also produce featmap.txt
-    python mapfeat.py
-    # split train and test
-    python mknfold.py agaricus.txt 1
-    cd $curDir
-fi
-
-hadoop fs -mkdir $input
-hadoop fs -put $trainFile $input
-#hadoop fs -rm -skipTrash -r $output
-
-# train and output the models
-python ../../rabit/tracker/rabit_hadoop.py -n 3 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
-    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf dsplit=row num_round=3 data=stdin model_out=stdout
-
-# fetch the final model file
-hadoop fs -get $output/part-00000 ./final.model
-# output predictions using task=pred
-../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=./final.model
-# dump the boosters of final.model into dump.raw.txt
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map when dumping for better readability
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
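Once either script finishes, the artifacts it leaves in the working directory can be inspected as below; `pred.txt` assumes the xgboost CLI's default output name for `task=pred` (set `name_pred` to change it):

```bash
# final.model   - trained model fetched from HDFS (part-00000)
# pred.txt      - one predicted probability per test instance
# dump.raw.txt  - boosted trees with raw feature indices
# dump.nice.txt - boosted trees using names from featmap.txt
head pred.txt dump.nice.txt
```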