From d5e9b1d4eadf9831cba51b3ef52106696c3082f1 Mon Sep 17 00:00:00 2001 From: Boliang Chen Date: Sun, 11 Jan 2015 13:08:52 +0800 Subject: [PATCH 1/4] delete hadoop conf --- .../mushroom.hadoop.conf | 31 ------------------- 1 file changed, 31 deletions(-) delete mode 100644 demo/binary_classification/mushroom.hadoop.conf diff --git a/demo/binary_classification/mushroom.hadoop.conf b/demo/binary_classification/mushroom.hadoop.conf deleted file mode 100644 index 1dffe4f8d..000000000 --- a/demo/binary_classification/mushroom.hadoop.conf +++ /dev/null @@ -1,31 +0,0 @@ -# General Parameters, see comment for each definition -# choose the booster, can be gbtree or gblinear -booster = gbtree -# choose logistic regression loss function for binary classification -objective = binary:logistic - -# Tree Booster Parameters -# step size shrinkage -eta = 1.0 -# minimum loss reduction required to make a further partition -gamma = 1.0 -# minimum sum of instance weight(hessian) needed in a child -min_child_weight = 1 -# maximum depth of a tree -max_depth = 3 - -# Task Parameters -# the number of round to do boosting -num_round = 2 -# 0 means do not save any model except the final round model -save_period = 0 -# The path of training data -data = "agaricus.txt.train" - -# The following parameters are not supported by xgboost running in hadoop yet! 
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-#eval[test] = "agaricus.txt.test"
-# evaluate on training data as well each round
-#eval_train = 1
-# The path of test data
-#test:data = "agaricus.txt.test"

From 2f95968a1ca46e4ade45765a612e473cb917a45f Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sun, 11 Jan 2015 15:34:55 +0800
Subject: [PATCH 2/4] ok

---
 multi-node/hadoop/run_hadoop_mushroom.sh | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 multi-node/hadoop/run_hadoop_mushroom.sh

diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
new file mode 100755
index 000000000..2f095ff25
--- /dev/null
+++ b/multi-node/hadoop/run_hadoop_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Mushroom demo on Hadoop: $1 = rabit_hadoop.py -n value, $2 = HDFS dir for data/ and model/.
+if [ "$#" -lt 2 ]; then
+  echo "Usage: $0 NWORKERS PATH_IN_HDFS" >&2
+  exit 1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir "$2/data"
+hadoop fs -put ../../demo/data/agaricus.txt.train "$2/data"
+
+# train and write the final model to HDFS; stop here on failure so the
+# steps below never run against a missing or stale final.model
+../../rabit/tracker/rabit_hadoop.py -n "$1" -i "$2/data/agaricus.txt.train" \
+    -o "$2/model" -f ../../demo/data/agaricus.txt.test \
+    ../../xgboost mushroom.hadoop.conf dsplit=row || exit 1
+
+# get the final model file; abort if it cannot be fetched
+hadoop fs -get "$2/model/part-00000" ./final.model || exit 1
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From fb65356dd421662d4e6608c940f123cce9f87d03 Mon Sep 17 00:00:00 2001
From: Boliang Chen
Date: Sun, 11 Jan 2015 15:41:46 +0800
Subject: [PATCH 3/4] change file name

---
.../hadoop/run_binary_classification.sh | 43 ------------------- 1 file changed, 43 deletions(-) delete mode 100755 multi-node/hadoop/run_binary_classification.sh diff --git a/multi-node/hadoop/run_binary_classification.sh b/multi-node/hadoop/run_binary_classification.sh deleted file mode 100755 index 740a468cf..000000000 --- a/multi-node/hadoop/run_binary_classification.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -if [ "$#" -lt 2 ]; -then - echo "Usage: " - exit -1 -fi - -curDir=`pwd` -dataDir=../../demo/binary_classification -trainFile=$dataDir/agaricus.txt.train -input=$2 -output=$2/model - -# generate the training file if it doesnot exist -if [ ! -f "$trainFile" ]; -then - echo "Generating training file:" - cd $dataDir - # map feature using indicator encoding, also produce featmap.txt - python mapfeat.py - # split train and test - python mknfold.py agaricus.txt 1 - cd $curDir -fi - -hadoop fs -mkdir $input -hadoop fs -put $trainFile $input -#hadoop fs -rm -skipTrash -r $output - -# training and output the final model file -python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \ - --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout - -# get the final model file -hadoop fs -get $output/part-00000 ./final.model -# output prediction task=pred -../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model -# print the boosters of 00002.model in dump.raw.txt -../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt -# use the feature map in printing for better visualization -../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt -cat dump.nice.txt From ceabf5755f447c40a22adcaaef510aceff2723e5 Mon Sep 17 00:00:00 2001 From: Boliang Chen Date: Sun, 11 Jan 2015 15:44:16 +0800 Subject: [PATCH 4/4] hadoop version conf --- 
multi-node/hadoop/mushroom.hadoop.conf | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 multi-node/hadoop/mushroom.hadoop.conf diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf new file mode 100644 index 000000000..305b82dd3 --- /dev/null +++ b/multi-node/hadoop/mushroom.hadoop.conf @@ -0,0 +1,30 @@ +# General Parameters, see the comment on each definition +# choose the booster, can be gbtree or gblinear +booster = gbtree +# choose the logistic regression loss function for binary classification +objective = binary:logistic + +# Tree Booster Parameters +# step size shrinkage +eta = 1.0 +# minimum loss reduction required to make a further partition +gamma = 1.0 +# minimum sum of instance weight (hessian) needed in a child +min_child_weight = 1 +# maximum depth of a tree +max_depth = 3 + +# Task Parameters +# the number of boosting rounds +num_round = 2 +# 0 means do not save any model except the final round model +save_period = 0 +# The path of the training data +data = stdin +# The path of the output model file +model_out = stdout + +# The path of the validation data, used to monitor the training process; here [test] sets the name of the validation set +eval[test] = "agaricus.txt.test" +# also evaluate on the training data each round +eval_train = 1