recheck column mode

tqchen 2014-11-19 11:21:07 -08:00
parent dffcbc838b
commit 54e2ed90d7
8 changed files with 51 additions and 93 deletions


@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
MPIBIN = xgboost-mpi
SLIB = wrapper/libxgboostwrapper.so
.PHONY: clean all python Rpack
.PHONY: clean all mpi python Rpack
all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
all: $(BIN) $(OBJ) $(SLIB)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
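In terms of build commands, the change above can be summarized as follows; this is a sketch assuming a working MPI toolchain, which the diff itself does not show:

```sh
make        # builds xgboost, the object files, and wrapper/libxgboostwrapper.so
make mpi    # builds the xgboost-mpi binary separately
```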


@ -1,3 +0,0 @@
This folder contains a toy example script to run xgboost-mpi.
This is an experimental distributed version of xgboost.


@ -1,36 +0,0 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of rounds to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0
# The path of the training data; %d is the wildcard for the rank of the data
# The idea is that each process takes a feature matrix with a subset of the columns
#
data = "train.col%d"
# The path of validation data, used to monitor training process; here [test] sets the name of the validation set
eval[test] = "../data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data; the full test data is needed here, so try not to use it, or keep a subsampled version
test:data = "agaricus.txt.test"
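As an illustration of the %d wildcard above, a hypothetical 3-process run would lay the data out as follows (the shard names follow the train.col%d pattern; nothing beyond that is specified in this config):

```sh
# rank 0 reads train.col0
# rank 1 reads train.col1
# rank 2 reads train.col2
ls train.col*    # train.col0  train.col1  train.col2
```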


@ -1,19 +0,0 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
    echo "Usage: nprocess"
    exit -1
fi
rm -rf train.col*
k=$1
# split the lib svm file into k subfiles
python splitsvm.py ../data/agaricus.txt.train train $k
# run xgboost mpi
mpirun -n $k ../../xgboost-mpi mpi.conf
# the model can be directly loaded by the single-machine xgboost solver, as usual
../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
cat dump.nice.$k.txt


@ -1,32 +0,0 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different subcolumns
if len(sys.argv) < 4:
    print ('Usage:<fin> <fo> k')
    exit(0)
random.seed(10)
fmap = {}
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
for l in open(sys.argv[1]):
    arr = l.split()
    for f in fos:
        f.write(arr[0])
    for it in arr[1:]:
        fid = int(it.split(':')[0])
        if fid not in fmap:
            fmap[fid] = random.randint(0, k-1)
        fos[fmap[fid]].write(' '+it)
    for f in fos:
        f.write('\n')
for f in fos:
    f.close()
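A usage sketch for the script above, mirroring the invocation in the run script earlier in this commit (the shard count of 3 is arbitrary):

```sh
# split the libsvm training file into 3 column shards
python splitsvm.py ../data/agaricus.txt.train train 3
# produces train.col0, train.col1, train.col2; every shard keeps the label,
# and each feature id is routed to exactly one shard (chosen at random)
```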

multi-node/README.md (new file, 33 lines added)

@ -0,0 +1,33 @@
Distributed XGBoost
======
This folder contains information about the experimental version of distributed xgboost.
Build
=====
* You will need to have MPI
* In the root folder, run ```make mpi```; this will give you the xgboost-mpi binary
Design Choice
=====
* Does distributed xgboost rely on MPI?
  - Yes, but the dependency is isolated in the [sync](../src/sync/sync.h) module
  - Specifically, xgboost relies on an MPI protocol that provides Broadcast and AllReduce;
    if a platform or framework implements these primitives, xgboost should naturally extend to it
* How is the data distributed?
  - There are two solvers in distributed xgboost
  - The column-based solver splits the data by column; each node works on a subset of columns
    and uses exactly the same algorithm as the single-node version.
  - The row-based solver splits the data by row; each node works on a subset of rows
    and uses an approximate histogram counting algorithm that only examines a subset of the
    potential split points, as opposed to all of them.
* How to run the distributed version?
  - The current code runs in an MPI environment; you will need a network filesystem,
    or you must copy the data to the local filesystem before running the code
    (a minimal launch sketch is shown after this list).
  - The distributed version is still optimized for multi-threading.
    You should run one xgboost-mpi process per node and let it use most of the available CPUs;
    this will reduce communication overhead and improve performance.
  - One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
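A hypothetical column-split launch, pieced together from the removed run script and the new dsplit option; the conf file name and the shard count are placeholders, not part of this diff:

```sh
# shard the training file by column, one shard per MPI process
python splitsvm.py ../data/agaricus.txt.train train 4
# launch the distributed solver; dsplit=col selects the column-based updaters
mpirun -n 4 ./xgboost-mpi mpi.conf dsplit=col
```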
Examples
====
* [Column-based version](col-split)


@ -32,6 +32,7 @@ class BoostLearner {
    silent = 0;
    prob_buffer_row = 1.0f;
    part_load_col = 0;
    distributed_mode = 0;
  }
  ~BoostLearner(void) {
    if (obj_ != NULL) delete obj_;
@ -89,6 +90,17 @@ class BoostLearner {
      this->SetParam(n.c_str(), val);
    }
    if (!strcmp(name, "silent")) silent = atoi(val);
    if (!strcmp(name, "dsplit")) {
      if (!strcmp(val, "col")) {
        this->SetParam("updater", "distcol,prune");
        distributed_mode = 1;
      } else if (!strcmp(val, "row")) {
        this->SetParam("updater", "grow_histmaker,prune");
        distributed_mode = 2;
      } else {
        utils::Error("%s is invalid value for dsplit, should be row or col", val);
      }
    }
    if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
    if (!strcmp(name, "prob_buffer_row")) {
      prob_buffer_row = static_cast<float>(atof(val));
@ -352,6 +364,8 @@ class BoostLearner {
  // data fields
  // silent during training
  int silent;
  // distributed learning mode, if any, 0:none, 1:col, 2:row
  int distributed_mode;
  // randomly load part of data
  int part_load_col;
  // maximum buffered row value
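In effect, the new dsplit parameter is a shorthand for an updater configuration. A sketch of the mapping implemented above; the command line follows the key=value style used by the run script, and the invocation itself is hypothetical:

```sh
# dsplit=col -> updater=distcol,prune          (distributed_mode = 1)
# dsplit=row -> updater=grow_histmaker,prune   (distributed_mode = 2)
# any other value raises utils::Error
mpirun -n 2 ./xgboost-mpi mpi.conf dsplit=row
```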


@ -32,7 +32,7 @@ class BoostLearnTask {
      }
    }
    if (sync::IsDistributed()) {
      this->SetParam("updater", "distcol");
      this->SetParam("data_split", "col");
    }
    if (sync::GetRank() != 0) {
      this->SetParam("silent", "2");