recheck column mode

2014-11-19 11:21:07 -08:00 · 2014-11-19 11:21:07 -08:00 · 54e2ed90d7
commit 54e2ed90d7
parent dffcbc838b
8 changed files with 51 additions and 93 deletions
--- a/5
+++ b/5
@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so 
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
-all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
+all: $(BIN) $(OBJ) $(SLIB) 
 mpi: $(MPIBIN)
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
--- a/demo/mpi/README.md
+++ b/demo/mpi/README.md
@ -1,3 +0,0 @@
 This folder contains a toy example script to run xgboost-mpi. 
 This is an experimental distributed version of xgboost
--- a/demo/mpi/mpi.conf
+++ b/demo/mpi/mpi.conf
@ -1,36 +0,0 @@
 # General Parameters, see comment for each definition
 # choose the booster, can be gbtree or gblinear
 booster = gbtree
 # choose logistic regression loss function for binary classification
 objective = binary:logistic
 # Tree Booster Parameters
 # step size shrinkage
 eta = 1.0 
 # minimum loss reduction required to make a further partition
 gamma = 1.0 
 # minimum sum of instance weight(hessian) needed in a child
 min_child_weight = 1 
 # maximum depth of a tree
 max_depth = 3 
 # Task Parameters
 # the number of boosting rounds to run
 num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0 
 use_buffer = 0
 # The path of the training data; %d is a wildcard that is replaced by the rank of the process
 # The idea is that each process takes a feature matrix holding a subset of the columns
 #
 data = "train.col%d" 
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
 eval[test] = "../data/agaricus.txt.test" 
 # evaluate on training data as well each round
 eval_train = 1
 # The path of the test data; this needs the full test data, so try not to use it, or keep a subsampled version
 test:data = "agaricus.txt.test"      
--- a/demo/mpi/runexp-mpi.sh
+++ b/demo/mpi/runexp-mpi.sh
@ -1,19 +0,0 @@
 #!/bin/bash
 # Toy driver for the column-split MPI demo: splits the agaricus training
 # file into k column subsets, trains with xgboost-mpi on k processes, then
 # dumps the resulting model with the single-machine xgboost binary.
 #
 # Usage: <this script> nprocess
 #   nprocess - number of MPI processes (and number of column subfiles)

 # exactly one argument (the process count) is required
 if [[ $# -ne 1 ]]
 then
    echo "Usage: nprocess"
    # NOTE(review): exit statuses are limited to 0-255, so -1 is reported as 255 - confirm intended
    exit -1
 fi
 # remove column subfiles left over from a previous run
 rm -rf train.col*
 # NOTE(review): $k is used unquoted below; assumes a plain positive integer
 k=$1
 # split the libsvm file into k subfiles (train.col0 .. train.col<k-1>)
 python splitsvm.py ../data/agaricus.txt.train train $k
 # run xgboost-mpi with one process per column subfile
 mpirun -n $k ../../xgboost-mpi  mpi.conf 
 # the model can be directly loaded by the single-machine xgboost solver, as usual
 # (0002.model is presumably the final-round model, since mpi.conf sets num_round = 2 - confirm)
 ../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
 # show the dumped model
 cat dump.nice.$k.txt
--- a/demo/mpi/splitsvm.py
+++ b/demo/mpi/splitsvm.py
@ -1,32 +0,0 @@
 #!/usr/bin/python
 # Split a libsvm-format file column-wise into k files named <fo>.col0 ..
 # <fo>.col<k-1>. Every output file receives every row's first token; each
 # feature id is assigned to exactly one output file, chosen at random the
 # first time that id is seen.
 #
 # Usage: splitsvm.py <fin> <fo> k
 import sys
 import random
 # split libsvm file into different subcolumns
 if len(sys.argv) < 4:
    print ('Usage:<fin> <fo> k')
    # NOTE(review): exits with status 0 even though the arguments were invalid
    exit(0)
 # fixed seed so the feature-to-file assignment is the same on every run
 random.seed(10)
 # feature id -> index of the output file that holds this feature
 fmap = {}
 k = int(sys.argv[3])
 # NOTE(review): fi is never read or closed; the input is re-opened in the loop below
 fi = open( sys.argv[1], 'r' )
 # one output handle per column subset
 fos = []
 for i in range(k):
    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
 for l in open(sys.argv[1]):
    arr = l.split()
    # the first token of the row is written to every output file
    # NOTE(review): arr[0] raises IndexError on a blank input line
    for f in fos:
        f.write(arr[0])
    # remaining tokens are split on ':' and routed by the integer before it
    for it in arr[1:]:
        fid = int(it.split(':')[0])
        if fid not in fmap:
            # first occurrence of this feature: pick its output file uniformly in [0, k-1]
            fmap[fid] = random.randint(0, k-1)
        fos[fmap[fid]].write(' '+it)
    # terminate the row in every output file, including files that got no features
    for f in fos:
        f.write('\n')
 for f in fos:    
    f.close()
--- a/multi-node/README.md
+++ b/multi-node/README.md
@ -0,0 +1,33 @@
 Distributed XGBoost
 ======
 This folder contains information about the experimental version of distributed xgboost.
 Build
 =====
 * You will need to have MPI
 * In the root folder, run ```make mpi```; this will build xgboost-mpi
 Design Choice
 =====
 * Does distributed xgboost rely on MPI?
  - Yes, but the dependency is isolated in the [sync](../src/sync/sync.h) module
  - Specifically, xgboost relies on an MPI protocol that provides Broadcast and AllReduce;
     if other platforms/frameworks implement these protocols, xgboost should naturally extend to those platforms
 * How is the data distributed?
  - There are two solvers in distributed xgboost
  - The column-based solver splits data by column; each node works on a subset of columns
    and uses exactly the same algorithm as the single-node version.
  - The row-based solver splits data by row; each node works on a subset of rows.
    It uses an approximate histogram count algorithm and will only examine a subset of
    potential split points, as opposed to all split points.
 * How to run the distributed version
  - The current code runs in an MPI environment; you will need to have a network filesystem,
    or copy the data to the local file system before running the code
  - The distributed version is still multi-threading optimized.
    You should run one xgboost-mpi process per node that takes most of the available CPUs;
    this will reduce the communication overhead and improve the performance.
  - One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
 Examples
 ====
 * [Column-based version](col-split)
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@ -32,6 +32,7 @@ class BoostLearner {
    silent= 0;
    prob_buffer_row = 1.0f;
    part_load_col = 0;
    distributed_mode = 0;
  }
  ~BoostLearner(void) {
    if (obj_ != NULL) delete obj_;
@ -89,6 +90,17 @@ class BoostLearner {
      this->SetParam(n.c_str(), val);
    }
    if (!strcmp(name, "silent")) silent = atoi(val);
    if (!strcmp(name, "dsplit")) {
      if (!strcmp(val, "col")) {
        this->SetParam("updater", "distcol,prune");
        distributed_mode = 1;
      } else if (!strcmp(val, "row")) {
        this->SetParam("updater", "grow_histmaker,prune");
        distributed_mode = 2;
      } else {
        utils::Error("%s is invalid value for dsplit, should be row or col", val);
      }
    }
    if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
    if (!strcmp(name, "prob_buffer_row")) {
      prob_buffer_row = static_cast<float>(atof(val));
@ -352,6 +364,8 @@ class BoostLearner {
  // data fields
  // silent during training
  int silent;
  // distributed learning mode, if any, 0:none, 1:col, 2:row
  int distributed_mode;
  // randomly load part of data
  int part_load_col;
  // maximum buffered row value
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@ -32,7 +32,7 @@ class BoostLearnTask {
      }
    }
    if (sync::IsDistributed()) {
-      this->SetParam("updater", "distcol");
+      this->SetParam("data_split", "col");
    }
    if (sync::GetRank() != 0) {
      this->SetParam("silent", "2");
		`@ -1,3 +0,0 @@`
			`This folder contains toy example script to run xgboost-mpi.`

			`This is an experimental distributed version of xgboost`