From 54e2ed90d7b21aea2d7a302f72d6e9c440d10968 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 19 Nov 2014 11:21:07 -0800
Subject: [PATCH] recheck column mode

---
 Makefile                    |  5 +++--
 demo/mpi/README.md          |  3 ---
 demo/mpi/mpi.conf           | 36 ------------------------------------
 demo/mpi/runexp-mpi.sh      | 19 -------------------
 demo/mpi/splitsvm.py        | 32 --------------------------------
 multi-node/README.md        | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/learner/learner-inl.hpp | 14 ++++++++++++++
 src/xgboost_main.cpp        |  2 +-
 8 files changed, 84 insertions(+), 93 deletions(-)
 delete mode 100644 demo/mpi/README.md
 delete mode 100644 demo/mpi/mpi.conf
 delete mode 100755 demo/mpi/runexp-mpi.sh
 delete mode 100644 demo/mpi/splitsvm.py
 create mode 100644 multi-node/README.md

diff --git a/Makefile b/Makefile
index 72c981706..51b7a578a 100644
--- a/Makefile
+++ b/Makefile
@@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so
 
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
 
-all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
+all: $(BIN) $(OBJ) $(SLIB)
+mpi: $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
diff --git a/demo/mpi/README.md b/demo/mpi/README.md
deleted file mode 100644
index 60fd0eb6e..000000000
--- a/demo/mpi/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This folder contains toy example script to run xgboost-mpi.
-
-This is an experimental distributed version of xgboost
diff --git a/demo/mpi/mpi.conf b/demo/mpi/mpi.conf
deleted file mode 100644
index 5b1f978d1..000000000
--- a/demo/mpi/mpi.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0
-# minimum loss reduction required to make a further partition
-gamma = 1.0
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1
-# maximum depth of a tree
-max_depth = 3
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0
-use_buffer = 0
-
-
-# The path of training data %d is the wildcard for the rank of the data
-# The idea is each process take a feature matrix with subset of columns
-#
-data = "train.col%d"
-
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "../data/agaricus.txt.test"
-# evaluate on training data as well each round
-eval_train = 1
-
-# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
-test:data = "agaricus.txt.test"
diff --git a/demo/mpi/runexp-mpi.sh b/demo/mpi/runexp-mpi.sh
deleted file mode 100755
index cc0c6d459..000000000
--- a/demo/mpi/runexp-mpi.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train.col*
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../data/agaricus.txt.train train $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi mpi.conf
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
-cat dump.nice.$k.txt
diff --git a/demo/mpi/splitsvm.py b/demo/mpi/splitsvm.py
deleted file mode 100644
index 365aef610..000000000
--- a/demo/mpi/splitsvm.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different subcolumns
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-fmap = {}
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
-
-for l in open(sys.argv[1]):
-    arr = l.split()
-    for f in fos:
-        f.write(arr[0])
-    for it in arr[1:]:
-        fid = int(it.split(':')[0])
-        if fid not in fmap:
-            fmap[fid] = random.randint(0, k-1)
-        fos[fmap[fid]].write(' '+it)
-    for f in fos:
-        f.write('\n')
-for f in fos:
-    f.close()
diff --git a/multi-node/README.md b/multi-node/README.md
new file mode 100644
index 000000000..fab7472e7
--- /dev/null
+++ b/multi-node/README.md
@@ -0,0 +1,66 @@
+Distributed XGBoost
+======
+This folder contains information about the experimental version of distributed xgboost.
+
+Build
+=====
+* You will need to have MPI installed
+* In the root folder, run ```make mpi```; this will give you xgboost-mpi
+
+Design Choice
+=====
+* Does distributed xgboost rely on MPI?
+  - Yes, but the dependency is isolated in the [sync](../src/sync/sync.h) module
+  - Specifically, xgboost relies only on an MPI protocol that provides Broadcast and AllReduce;
+    if a platform/framework implements these primitives, xgboost should naturally extend to it
+* How is the data distributed?
+  - There are two solvers in distributed xgboost
+  - The column-based solver splits the data by column; each node works on a subset of columns
+    and uses exactly the same algorithm as the single-node version (see the sketch after this list).
+  - The row-based solver splits the data by row; each node works on a subset of rows
+    and uses an approximate histogram counting algorithm, which only examines a subset of
+    the potential split points, as opposed to all of them.
+* How to run the distributed version
+  - The current code runs in an MPI environment; you will need a network filesystem,
+    or to copy the data to each machine's local filesystem, before running the code
+  - The distributed version is still optimized for multi-threading.
+    You should run one xgboost-mpi per node and let it use most of the available CPUs;
+    this reduces the communication overhead and improves performance.
+  - One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
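+
+To make the column split concrete, here is a minimal Python sketch of how a
+libsvm training file can be divided into per-process column shards. It is
+modeled on the removed demo script `demo/mpi/splitsvm.py`; the input path,
+output names, and shard count are illustrative only:
+
+```python
+import random
+
+random.seed(10)   # fixed seed so every run produces the same shard assignment
+fmap = {}         # feature id -> shard index
+k = 4             # number of column shards, one per worker process
+fos = [open('train.col%d' % i, 'w') for i in range(k)]
+for line in open('../data/agaricus.txt.train'):
+    arr = line.split()
+    for fo in fos:            # every shard keeps the label
+        fo.write(arr[0])
+    for it in arr[1:]:        # route each feature to exactly one shard
+        fid = int(it.split(':')[0])
+        if fid not in fmap:
+            fmap[fid] = random.randint(0, k - 1)
+        fos[fmap[fid]].write(' ' + it)
+    for fo in fos:
+        fo.write('\n')
+for fo in fos:
+    fo.close()
+```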
+
+Examples
+====
+* [Column-based version](col-split)
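+
+As a hypothetical end-to-end run (process count, file names, and the configuration
+file are illustrative, following the removed demo under demo/mpi): split the
+training file into 4 column shards with the sketch above, then start the workers
+with `mpirun -n 4 ../../xgboost-mpi mpi.conf dsplit=col`, where `dsplit=col` is
+the column-split mode introduced in this patch.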
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 89bc28aec..b1a95dd96 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -32,6 +32,7 @@ class BoostLearner {
     silent= 0;
     prob_buffer_row = 1.0f;
     part_load_col = 0;
+    distributed_mode = 0;
   }
   ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -89,6 +90,17 @@ class BoostLearner {
       this->SetParam(n.c_str(), val);
     }
     if (!strcmp(name, "silent")) silent = atoi(val);
+    if (!strcmp(name, "dsplit")) {
+      if (!strcmp(val, "col")) {
+        this->SetParam("updater", "distcol,prune");
+        distributed_mode = 1;
+      } else if (!strcmp(val, "row")) {
+        this->SetParam("updater", "grow_histmaker,prune");
+        distributed_mode = 2;
+      } else {
+        utils::Error("%s is an invalid value for dsplit; it must be row or col", val);
+      }
+    }
     if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) {
       prob_buffer_row = static_cast<float>(atof(val));
@@ -352,6 +364,8 @@ class BoostLearner {
   // data fields
   // silent during training
   int silent;
+  // distributed learning mode: 0 = none, 1 = column split, 2 = row split
+  int distributed_mode;
   // randomly load part of data
   int part_load_col;
   // maximum buffred row value
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index ef3a9079c..1b596ebfb 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -32,7 +32,7 @@ class BoostLearnTask {
       }
     }
     if (sync::IsDistributed()) {
-      this->SetParam("updater", "distcol");
+      this->SetParam("data_split", "col");
     }
     if (sync::GetRank() != 0) {
       this->SetParam("silent", "2");