From 69e079941e2612cb6379a97126b3d33caef66af8 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sat, 10 Jan 2015 23:46:29 -0800
Subject: [PATCH 1/9] allow pred to stdout

---
 src/xgboost_main.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index 9440c791a..db37cbd1d 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -32,7 +32,7 @@ class BoostLearnTask {
       }
     }
     // do not save anything when save to stdout
-    if (model_out == "stdout") {
+    if (model_out == "stdout" || name_pred == "stdout") {
      this->SetParam("silent", "1");
      save_period = 0;
    }
@@ -235,12 +235,17 @@ class BoostLearnTask {
     std::vector<float> preds;
     if (!silent) printf("start prediction...\n");
     learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
-    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
-    FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
-    for (size_t i = 0; i < preds.size(); i++) {
-      fprintf(fo, "%f\n", preds[i]);
+    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
+    FILE *fo;
+    if (name_pred != "stdout") {
+      fo = utils::FopenCheck(name_pred.c_str(), "w");
+    } else {
+      fo = stdout;
     }
-    fclose(fo);
+    for (size_t i = 0; i < preds.size(); ++i) {
+      fprintf(fo, "%g\n", preds[i]);
+    }
+    if (fo != stdout) fclose(fo);
   }
  private:
  /*! \brief whether silent */
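
[Illustration, not part of the patch above: with this change the existing `name_pred` parameter can be set to `stdout` so predictions are streamed instead of written to a file, and silent mode is forced so log output does not mix with the predictions. The conf file, model, and data paths below are placeholders borrowed from the demo.]

```bash
# default behaviour: predictions go to the file named by name_pred
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
    test:data=../../demo/data/agaricus.txt.test name_pred=pred.txt

# with this patch: stream predictions to stdout and pipe them elsewhere
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
    test:data=../../demo/data/agaricus.txt.test name_pred=stdout | head
```
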
From 0111a14aef2ecd2bbc98ec8e0b4111b01c8b52d6 Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 23:57:52 +0800
Subject: [PATCH 2/9] yarn script

---
 multi-node/hadoop/run_yarn_mushroom.sh | 29 ++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 multi-node/hadoop/run_yarn_mushroom.sh

diff --git a/multi-node/hadoop/run_yarn_mushroom.sh b/multi-node/hadoop/run_yarn_mushroom.sh
new file mode 100644
index 000000000..07ac291d1
--- /dev/null
+++ b/multi-node/hadoop/run_yarn_mushroom.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+    echo "Usage: "
+    exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir $3/data
+hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
+
+
+python ../../rabit/tracker/rabit_yarn.py -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
+    -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2 dsplit=row
+
+
+
+# get the final model file
+hadoop fs -get $3/model/part-00000 ./final.model
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
+    test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
+fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
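
[Illustration, not part of the patch above: a hypothetical invocation of the new script. From its body, $1 is the number of workers, $2 the number of threads per worker, and $3 a writable HDFS path; the concrete values below are placeholders.]

```bash
# 4 workers with 2 threads each, staging data and the model under an HDFS working directory
bash run_yarn_mushroom.sh 4 2 /user/me/xgboost-mushroom
# the script then fetches the trained model to ./final.model and dumps it to dump.nice.txt
```
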
From 62a108a7c2906ca6aebe3c3a41f359781edfbbd0 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sun, 11 Jan 2015 21:02:38 -0800
Subject: [PATCH 3/9] chg of hadoop script

---
 multi-node/hadoop/README.md                   | 10 +++----
 multi-node/hadoop/run_hadoop_mushroom.sh      | 29 -------------------
 .../{run_yarn_mushroom.sh => run_mushroom.sh} | 12 ++------
 3 files changed, 8 insertions(+), 43 deletions(-)
 delete mode 100755 multi-node/hadoop/run_hadoop_mushroom.sh
 rename multi-node/hadoop/{run_yarn_mushroom.sh => run_mushroom.sh} (68%)
 mode change 100644 => 100755

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index adfacdb8b..a3411fee4 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -1,6 +1,6 @@
 Distributed XGBoost: Hadoop Version
 ====
-* Hadoop version: run ```bash run_binary_classification.sh ```
+* Hadoop version: run ```bash run_binary_classification.sh ```
   - This is the hadoop version of binary classification example in the demo folder.
 How to Use
 ====
@@ -9,7 +9,7 @@ How to Use
 
 Notes
 ====
-* The code has been tested on MapReduce 1 (MRv1), it should be ok to run on MapReduce 2 (MRv2, YARN).
-* The code is multi-threaded, so you want to run one xgboost per node/worker, which means the parameter should be less than the number of slaves/workers.
-* The hadoop version now can only save the final model and evaluate test data locally after the training process.
-
+* The code has been tested on MapReduce 1 (MRv1) and YARN, it recommended run on MapReduce 2 (MRv2, YARN).
+* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set to be number of cores you have on each machine.
+  - You will need YARN to set specify number of cores of each worker
+* The hadoop version save the final model into HDFS
diff --git a/multi-node/hadoop/run_hadoop_mushroom.sh b/multi-node/hadoop/run_hadoop_mushroom.sh
deleted file mode 100755
index 2f095ff25..000000000
--- a/multi-node/hadoop/run_hadoop_mushroom.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -lt 2 ];
-then
-    echo "Usage: "
-    exit -1
-fi
-
-# put the local training file to HDFS
-hadoop fs -mkdir $2/data
-hadoop fs -put ../../demo/data/agaricus.txt.train $2/data
-
-# training and output the final model file
-../../rabit/tracker/rabit_hadoop.py -n $1 -i $2/data/agaricus.txt.train \
-    -o $2/model -f ../../demo/data/agaricus.txt.test \
-    ../../xgboost mushroom.hadoop.conf dsplit=row
-
-# get the final model file
-hadoop fs -get $2/model/part-00000 ./final.model
-
-# output prediction task=pred
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
-# print the boosters of final.model in dump.raw.txt
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
-# use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-    fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
-cat dump.nice.txt
diff --git a/multi-node/hadoop/run_yarn_mushroom.sh b/multi-node/hadoop/run_mushroom.sh
old mode 100644
new mode 100755
similarity index 68%
rename from multi-node/hadoop/run_yarn_mushroom.sh
rename to multi-node/hadoop/run_mushroom.sh
index 07ac291d1..1e647047f
--- a/multi-node/hadoop/run_yarn_mushroom.sh
+++ b/multi-node/hadoop/run_mushroom.sh
@@ -9,21 +9,15 @@ fi
 hadoop fs -mkdir $3/data
 hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
 
-
-python ../../rabit/tracker/rabit_yarn.py -nw $1 -nt $2 -f ../../demo/data/agaricus.txt.test \
-    -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2 dsplit=row
-
-
+../../rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/model ../../xgboost mushroom.hadoop.conf nthread=$2
 
 # get the final model file
 hadoop fs -get $3/model/part-00000 ./final.model
 
 # output prediction task=pred
-../../xgboost mushroom.hadoop.conf task=pred model_in=final.model \
-    test:data=../../demo/data/agaricus.txt.test
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
 # print the boosters of final.model in dump.raw.txt
 ../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
 # use the feature map in printing for better visualization
-../../xgboost mushroom.hadoop.conf task=dump model_in=final.model \
-fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt

From d57cb4f17b1770c7ed29524012bb1762dc7b6323 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 12 Jan 2015 09:02:53 -0800
Subject: [PATCH 4/9] Update mushroom.hadoop.conf

---
 multi-node/hadoop/mushroom.hadoop.conf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
index 305b82dd3..a40c950a7 100644
--- a/multi-node/hadoop/mushroom.hadoop.conf
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -24,7 +24,5 @@ data = stdin
 # The path of model file
 model_out = stdout
 
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "agaricus.txt.test"
 # evaluate on training data as well each round
 eval_train = 1
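
[Illustration, not part of the patch above: after this change the streaming-related part of mushroom.hadoop.conf reads roughly as sketched below. Only data, model_out, and eval_train are taken from the patches; the booster settings are assumed from the single machine mushroom demo.]

```
# read training data from stdin and emit the final model on stdout (hadoop streaming)
data = stdin
model_out = stdout
# evaluate on training data as well each round
eval_train = 1
# assumed demo-style booster settings, not shown in the patches
objective = binary:logistic
max_depth = 3
num_round = 2
```
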
From 5e0e8a5ff7da9320656299449df74ec535e78fce Mon Sep 17 00:00:00 2001
From: tqchen
Date: Mon, 12 Jan 2015 11:47:46 -0800
Subject: [PATCH 5/9] changes

---
 multi-node/hadoop/README.md | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index ce514be2e..f2afc6a1d 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -2,16 +2,17 @@ Distributed XGBoost: Hadoop Version
 ====
 * The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
 * It relies on [Rabit Library](https://github.com/tqchen/rabit) and Hadoop Streaming.
-* Quick start: run ```bash run_binary_classification.sh ```
+* Quick start: run ```bash run_binary_classification.sh ```
   - This is the hadoop version of binary classification example in the demo folder.
-  - More info of the binary classification task can be refered to https://github.com/tqchen/xgboost/wiki/Binary-Classification.
+  - More info of the usage of xgboost can be referred to the [wiki page](https://github.com/tqchen/xgboost/wiki)
 
 Before you run the script
 ====
-* Make sure you have set up the hadoop environment. Otherwise you should run single machine examples in the demo fold.
+* Make sure you have set up the hadoop environment.
+* If you want to only use single machine multi-threading, tryout single machine examples in the [demo folder](../../demo).
 * Build: run ```bash build.sh``` in the root folder, it will automatically download rabit and build xgboost.
-* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, plz set up hadoop-streaming.jar path in rabit_hadoop.py.
-
+* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
+
 How to Use
 ====
 * Input data format: LIBSVM format. The example here uses generated data in demo/data folder.
@@ -19,25 +20,22 @@ How to Use
 * Use rabit ```rabit_hadoop.py``` to submit training task to hadoop, and save the final model file.
 * Get the final model file from HDFS, and locally do prediction as well as visualization of model.
 
-XGBoost: Single machine verison VS Hadoop version
-====
-If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
-* Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first.
-* IO: instead of reading and writing file locally, hadoop version use "stdin" to read training file and use "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in conf file to ```data = stdin; model_out = stdout```.
+XGBoost: Single machine verison vs Hadoop version
+====
+If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
+* Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first.
+* IO: instead of reading and writing file locally, hadoop version uses "stdin" to read training file and uses "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in conf file to ```data=stdin``` and ```model_out=stdout```.
 * File cache: ```rabit_hadoop.py``` also provide several ways to cache necesary files, including binary file (xgboost), conf file, small size of dataset which used for eveluation during the training process, and so on.
   - Any file used in config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
   - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to spilt file names).
   - The local path of cached files in command is "./".
   - Since the cached files will be packaged and delivered to hadoop slave nodes, the cached file should not be large. For instance, trying to cache files of GB size may reduce the performance.
-* Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train" and "eval[test]" in conf file and cache the evaluation file.
-* Hadoop version now can only save the final model.
-* Predict locally. Althought the hadoop version supports training process, you should do prediction locally, just the same as single machine version.
-* The hadoop version now can only save the final model.
-* More details of hadoop version can be referred to the usage of ```rabit_hadoop.py```.
+* Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train".
+* Hadoop version now only saves the final model.
+* More details of submission can be referred to the usage of ```rabit_hadoop.py```.
 
 Notes
 ====
 * The code has been tested on MapReduce 1 (MRv1) and YARN, it recommended run on MapReduce 2 (MRv2, YARN).
 * The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set to be number of cores you have on each machine.
   - You will need YARN to set specify number of cores of each worker
-
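
[Illustration, not part of the patch above: the submission command quoted in the README, plus a variant with -f file caching. $hdfsPath and $localPath are placeholders taken from the README text.]

```bash
# xgboost and mushroom.hadoop.conf appear on the command line, so they are cached automatically
../../rabit/tracker/rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train \
    -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf

# ship an extra evaluation file to the workers with -f (several files: -f a -f b, or -f a#b)
../../rabit/tracker/rabit_hadoop.py -n 3 -f ../../demo/data/agaricus.txt.test \
    -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model \
    $localPath/xgboost mushroom.hadoop.conf
```
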
From 6b7f20c002a0ed8b5b826d6c9f8f481e6208a508 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Mon, 12 Jan 2015 11:49:42 -0800
Subject: [PATCH 6/9] chgs

---
 multi-node/hadoop/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index f2afc6a1d..13e68c4f0 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -20,7 +20,7 @@ How to Use
 * Use rabit ```rabit_hadoop.py``` to submit training task to hadoop, and save the final model file.
 * Get the final model file from HDFS, and locally do prediction as well as visualization of model.
 
-XGBoost: Single machine verison vs Hadoop version
+Single machine vs Hadoop version
 ====
 If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
 * Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first.
@@ -31,8 +31,8 @@ If you have used xgboost (single machine version) before, this section will show
   - The local path of cached files in command is "./".
   - Since the cached files will be packaged and delivered to hadoop slave nodes, the cached file should not be large. For instance, trying to cache files of GB size may reduce the performance.
 * Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train".
-* Hadoop version now only saves the final model.
-* More details of submission can be referred to the usage of ```rabit_hadoop.py```.
+* More details of submission can be referred to the usage of ```rabit_hadoop.py```.
+* The model saved by hadoop version is compatible with single machine version.
 
 Notes
 ====

From 2a9a864b11ec6bd350f7e7b667d26c39a92777d6 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Mon, 12 Jan 2015 11:50:18 -0800
Subject: [PATCH 7/9] ok

---
 multi-node/hadoop/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index 13e68c4f0..d98068fe9 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -9,7 +9,7 @@ Distributed XGBoost: Hadoop Version
 Before you run the script
 ====
 * Make sure you have set up the hadoop environment.
-* If you want to only use single machine multi-threading, tryout single machine examples in the [demo folder](../../demo).
+* If you want to only use single machine multi-threading, try single machine examples in the [demo folder](../../demo).
 * Build: run ```bash build.sh``` in the root folder, it will automatically download rabit and build xgboost.
 * Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
 

From 9346c328cb721e94e1fa399765a47740a8fe96b8 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Mon, 12 Jan 2015 11:53:40 -0800
Subject: [PATCH 8/9] chg

---
 multi-node/hadoop/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index d98068fe9..e03f3a592 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -2,7 +2,7 @@ Distributed XGBoost: Hadoop Version
 ====
 * The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
 * It relies on [Rabit Library](https://github.com/tqchen/rabit) and Hadoop Streaming.
-* Quick start: run ```bash run_binary_classification.sh ```
+* Quick start: run ```bash run_mushroom.sh ```
   - This is the hadoop version of binary classification example in the demo folder.
   - More info of the usage of xgboost can be referred to the [wiki page](https://github.com/tqchen/xgboost/wiki)
 
 Before you run the script
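
[Illustration, not part of the patches above: since the saved model is compatible with the single machine version, the local follow-up steps from run_mushroom.sh apply unchanged; the HDFS path is a placeholder.]

```bash
# fetch the model trained on the cluster, then predict and dump it locally
hadoop fs -get /user/me/xgboost-mushroom/model/part-00000 ./final.model
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
```
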
From a53f0cd9bf9ea5ed78a8ec28ec19f5ca5b8702ea Mon Sep 17 00:00:00 2001
From: tqchen
Date: Mon, 12 Jan 2015 11:55:42 -0800
Subject: [PATCH 9/9] doc chg

---
 multi-node/hadoop/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
index e03f3a592..e513c59cf 100644
--- a/multi-node/hadoop/README.md
+++ b/multi-node/hadoop/README.md
@@ -35,7 +35,9 @@ If you have used xgboost (single machine version) before, this section will show
 * The model saved by hadoop version is compatible with single machine version.
 
 Notes
-====
-* The code has been tested on MapReduce 1 (MRv1) and YARN, it recommended run on MapReduce 2 (MRv2, YARN).
-* The code is multi-threaded, so you want to run one xgboost per node/worker, which means you want to set to be number of cores you have on each machine.
+====
+* The code has been tested on MapReduce 1 (MRv1) and YARN.
+  - We recommend running it on MapReduce 2 (MRv2, YARN) so that multi-threading can be enabled.
+* The code is optimized with multi-threading, so you will want to run one xgboost per node/worker for best performance.
+  - You will want to set the number of threads to the number of cores you have on each machine.
   - You will need YARN to set specify number of cores of each worker
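
[Illustration, not part of the patch above: per these notes, one multi-threaded xgboost runs per worker, so the thread count given to the tracker should match nthread and the cores requested from YARN. The worker and thread numbers below are hypothetical.]

```bash
# 4 workers, 8 cores each: -nt reserves the cores per worker, nthread=8 uses them inside xgboost
../../rabit/tracker/rabit_hadoop.py -n 4 -nt 8 -i /user/me/data/agaricus.txt.train \
    -o /user/me/model ../../xgboost mushroom.hadoop.conf nthread=8
```
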