get multinode in

This commit is contained in:
tqchen
2014-11-19 19:19:53 -08:00
parent 7c3a392136
commit c42ba8d281
14 changed files with 157 additions and 23 deletions


@@ -28,6 +28,7 @@ Design Choice
this will reduce the communication overhead and improve performance.
- One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
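A minimal sketch of the first option, assuming OpenMPI-style hostfiles (the host names here are placeholders, not part of this commit):

```bash
# hypothetical hostfile: one slot per machine, so mpirun starts one process per node
cat > hosts <<EOF
node1 slots=1
node2 slots=1
EOF
mpirun -hostfile hosts -n 2 ../../xgboost-mpi mushroom-row.conf dsplit=row
```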
Examples
Usage
====
* [Column-based version](col-split)
* [Row-based version](row-split)


@@ -1,6 +1,6 @@
Distributed XGBoost: Column Split Version
====
* run ```bash run-mushroom.sh```
* run ```bash mushroom-row.sh <n-mpi-process>```
How to Use
====


@@ -0,0 +1,17 @@
Distributed XGBoost: Row Split Version
====
* Mushroom: run ```bash mushroom-row.sh <n-mpi-process>```
* Machine: run ```bash machine-row.sh <n-mpi-process>```
How to Use
====
* First split the data by rows
* In the config, specify the training data file with the wildcard %d, where %d stands for the rank of the node; each node will then load its own part of the data
* Enable row split mode by setting ```dsplit=row```
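As a quick sketch of how the wildcard resolves (illustration only; the shard names match the demo configs later in this commit):

```bash
# with data = "train.row%d" and 3 MPI processes, each rank substitutes its own rank
for rank in 0 1 2; do
  printf 'rank %d loads train.row%d\n' "$rank" "$rank"
done
```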
Notes
====
* The code is multi-threaded, so you will want to run only one xgboost-mpi process per node
* The row-based solver splits the data by rows; each node works on a subset of the rows. It uses an approximate histogram counting algorithm,
and will only examine a subset of the potential split points, as opposed to all split points.
* ```colsample_bytree``` is not yet supported in row split mode


@@ -0,0 +1,31 @@
# General Parameters, see comment for each definition
# choose the tree booster, can also change to gblinear
booster = gbtree
# this is the only difference from classification: use reg:linear to do linear regression
# when labels are in [0,1] we can also use reg:logistic
objective = reg:linear
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# do not create a binary buffer cache of the input data
use_buffer = 0
# The path of training data; %d is the wildcard for the rank of the node
data = "train-machine.row%d"
# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
eval[test] = "../../demo/regression/machine.txt.test"
# The path of test data
test:data = "../../demo/regression/machine.txt.test"


@@ -0,0 +1,21 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train-machine.row* *.model
k=$1
# make machine data
cd ../../demo/regression/
python mapfeat.py
python mknfold.py machine.txt 1
cd -
# split the LIBSVM file into k subfiles
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
# run xgboost mpi
mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row
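splitrows.py itself is not part of the hunks shown here; the sketch below is an assumption about its behavior, based only on the shard naming the configs rely on (`<prefix>.row0` ... `<prefix>.row(k-1)`):

```bash
# hedged stand-in for splitrows.py: deal the LIBSVM rows out round-robin into k shards
infile=../../demo/regression/machine.txt.train
prefix=train-machine
k=4  # number of shards, i.e. the <n-mpi-process> argument above
awk -v k="$k" -v p="$prefix" '{ print > (p ".row" ((NR-1) % k)) }' "$infile"
wc -l "$prefix".row*  # sanity check: shard sizes should differ by at most one line
```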


@@ -0,0 +1,35 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# do not create a binary buffer cache of the input data
use_buffer = 0
# The path of training data; %d is the wildcard for the rank of the node
# The idea is that each process takes a feature matrix with a subset of the rows
data = "train.row%d"
# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
eval[test] = "../../demo/data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data; this requires the full test data on each node, so try not to use it, or keep a subsampled version
test:data = "../../demo/data/agaricus.txt.test"


@@ -0,0 +1,19 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train.row* *.model
k=$1
# split the LIBSVM file into k subfiles
python splitrows.py ../../demo/data/agaricus.txt.train train $k
# run xgboost mpi
mpirun -n $k ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=1
# the model can be directly loaded by the single-machine xgboost solver, as usual
../../xgboost mushroom-row.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
cat dump.nice.$k.txt
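Since the saved model is in the ordinary single-machine format, prediction works the same way as the dump step above. A sketch following the single-machine demo convention (task=pred writes pred.txt by default); this command is not part of the commit:

```bash
# predict with the single-machine binary using the MPI-trained model
../../xgboost mushroom-row.conf task=pred model_in=0002.model
head pred.txt  # one prediction per line for ../../demo/data/agaricus.txt.test
```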