From d5e9b1d4eadf9831cba51b3ef52106696c3082f1 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 13:08:52 +0800
Subject: [PATCH 1/4] delete hadoop conf

---
 .../mushroom.hadoop.conf                      | 31 -------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 demo/binary_classification/mushroom.hadoop.conf
diff --git a/demo/binary_classification/mushroom.hadoop.conf b/demo/binary_classification/mushroom.hadoop.conf
deleted file mode 100644
index 1dffe4f8d..000000000
--- a/demo/binary_classification/mushroom.hadoop.conf
+++ /dev/null
@@ -1,31 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0 
-# minimum loss reduction required to make a further partition
-gamma = 1.0 
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1 
-# maximum depth of a tree
-max_depth = 3 
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0 
-# The path of training data
-data = "agaricus.txt.train" 
-
-# The following parameters are not supported by xgboost running in hadoop yet!
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-#eval[test] = "agaricus.txt.test" 
-# evaluate on training data as well each round
-#eval_train = 1
-# The path of test data 
-#test:data = "agaricus.txt.test"      

From 2f95968a1ca46e4ade45765a612e473cb917a45f Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 15:34:55 +0800
Subject: [PATCH 2/4] add run_hadoop_mushroom.sh demo script

---
 multi-node/hadoop/run_hadoop_mushroom.sh | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 multi-node/hadoop/run_hadoop_mushroom.sh

diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
new file mode 100755
index 000000000..2f095ff25
--- /dev/null
+++ b/multi-node/hadoop/run_hadoop_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Demo: train xgboost on the mushroom data with Hadoop, then predict and dump.
+if [ "$#" -lt 2 ];
+then
+    echo "Usage: <num_of_slave_nodes> <path_in_HDFS>" >&2
+    exit 1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir "$2/data"
+hadoop fs -put ../../demo/data/agaricus.txt.train "$2/data"
+
+# train and write the final model to HDFS, abort when the training job fails
+../../rabit/tracker/rabit_hadoop.py -n "$1" -i "$2/data/agaricus.txt.train" \
+    -o "$2/model" -f ../../demo/data/agaricus.txt.test \
+    ../../xgboost mushroom.hadoop.conf dsplit=row || exit 1
+
+# get the final model file
+hadoop fs -get "$2/model/part-00000" ./final.model
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt

From fb65356dd421662d4e6608c940f123cce9f87d03 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 15:41:46 +0800
Subject: [PATCH 3/4] change file name

---
 .../hadoop/run_binary_classification.sh       | 43 -------------------
 1 file changed, 43 deletions(-)
 delete mode 100755 multi-node/hadoop/run_binary_classification.sh

diff --git a/multi-node/hadoop/run_binary_classification.sh b/multi-node/hadoop/run_binary_classification.sh
deleted file mode 100755
index 740a468cf..000000000
--- a/multi-node/hadoop/run_binary_classification.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: <nworkers> <path_in_HDFS>"
-    exit -1
-fi
-
-curDir=`pwd`
-dataDir=../../demo/binary_classification
-trainFile=$dataDir/agaricus.txt.train
-input=$2
-output=$2/model
-
-# generate the training file if it doesnot exist
-if [ ! -f "$trainFile" ];
-then 
-  echo "Generating training file:"
-  cd $dataDir
-  # map feature using indicator encoding, also produce featmap.txt
-  python mapfeat.py
-  # split train and test
-  python mknfold.py agaricus.txt 1
-  cd $curDir
-fi
-
-hadoop fs -mkdir $input
-hadoop fs -put $trainFile $input
-#hadoop fs -rm -skipTrash -r $output
-
-# training and output the final model file
-python ../../rabit/tracker/rabit_hadoop.py -n $1 -i $input/agaricus.txt.train -o $output -f $dataDir/mushroom.hadoop.conf \
-    --jobname xgboost_hadoop ../../xgboost mushroom.hadoop.conf data=stdin model_out=stdout
-
-# get the final model file
-hadoop fs -get $output/part-00000 ./final.model
-# output prediction task=pred 
-../../xgboost $dataDir/mushroom.hadoop.conf task=pred model_in=final.model
-# print the boosters of 00002.model in dump.raw.txt
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost $dataDir/mushroom.hadoop.conf task=dump model_in=final.model fmap=$dataDir/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt

From ceabf5755f447c40a22adcaaef510aceff2723e5 Mon Sep 17 00:00:00 2001
From: Boliang Chen <cblsjtu@gmail.com>
Date: Sun, 11 Jan 2015 15:44:16 +0800
Subject: [PATCH 4/4] hadoop version conf

---
 multi-node/hadoop/mushroom.hadoop.conf | 30 ++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 multi-node/hadoop/mushroom.hadoop.conf

diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
new file mode 100644
index 000000000..305b82dd3
--- /dev/null
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -0,0 +1,30 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0 
+# minimum loss reduction required to make a further partition
+gamma = 1.0 
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1 
+# maximum depth of a tree
+max_depth = 3 
+
+# Task Parameters
+# the number of boosting rounds
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0 
+# The path of training data; stdin means the data is read from standard input
+data = stdin
+# The path of the output model file; stdout means the model is written to standard output
+model_out = stdout 
+
+# The path of validation data, used to monitor training process, here [test] sets name of the validation set
+eval[test] = "agaricus.txt.test" 
+# evaluate on training data as well each round
+eval_train = 1