From da54f5e5d860b81284325ac93d82b5f1b78d7026 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 19 Nov 2014 11:37:54 -0800
Subject: [PATCH] add note for col

---
 multi-node/col-split/README.md       | 14 ++++++++++++++
 multi-node/col-split/run-mushroom.sh | 19 +++++++++++++++++++
 multi-node/col-split/runexp-mpi.sh   |  4 ++--
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100755 multi-node/col-split/run-mushroom.sh
 mode change 100755 => 100644 multi-node/col-split/runexp-mpi.sh

diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
index 14fe993d2..b3053080f 100644
--- a/multi-node/col-split/README.md
+++ b/multi-node/col-split/README.md
@@ -1,2 +1,16 @@
 Column Split Version of XGBoost
 ====
+* run ```bash run-mushroom.sh <nprocess>```, where `<nprocess>` is the number of MPI processes to launch
+
+Steps to use column split version
+====
+* First, split the data by column.
+* In the config, specify the data file name with a wildcard %d, where %d stands for the rank of the node; each node loads its own part of the data.
+* Enable column split mode by ```dsplit=col```
+
+Note on the Column Split Version
+====
+* The code is multi-threaded, so you want to run one xgboost-mpi per node
+* The code will work correctly as long as the union of the column subsets covers all the columns we are interested in.
+  - The column subset can overlap with each other.
+* It uses exactly the same algorithm as the single-node version and examines all potential split points.
diff --git a/multi-node/col-split/run-mushroom.sh b/multi-node/col-split/run-mushroom.sh
new file mode 100755
index 000000000..5c4c06587
--- /dev/null
+++ b/multi-node/col-split/run-mushroom.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Demo: train column-split xgboost-mpi on the mushroom data. Argument: $1 = number of MPI processes.
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $0 <nprocess>" >&2
+    exit 1
+fi
+
+rm -rf train.col*
+k=$1
+
+# split the libsvm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train "$k" || exit 1
+
+# run xgboost-mpi with k processes; stop here on failure so a missing or stale model is never dumped
+mpirun -n "$k" ../../xgboost-mpi mushroom-col.conf updater=distcol silent=0 || exit 1
+
+# the model can be loaded directly by the single-machine xgboost solver, as usual
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump="dump.nice.$k.txt" || exit 1
+cat "dump.nice.$k.txt"
diff --git a/multi-node/col-split/runexp-mpi.sh b/multi-node/col-split/runexp-mpi.sh
old mode 100755
new mode 100644
index d5469e714..906ace94c
--- a/multi-node/col-split/runexp-mpi.sh
+++ b/multi-node/col-split/runexp-mpi.sh
@@ -12,8 +12,8 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-mpirun -n $k ../../xgboost-mpi  mushroom-col.conf dsplit=col
+mpirun -n $k ../../xgboost-mpi mushroom-col.conf dsplit=col
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mpi.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
 cat dump.nice.$k.txt