From 54e2ed90d7b21aea2d7a302f72d6e9c440d10968 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 19 Nov 2014 11:21:07 -0800
Subject: [PATCH] recheck column mode

---
 Makefile                    |  5 +++--
 demo/mpi/README.md          |  3 ---
 demo/mpi/mpi.conf           | 36 ------------------------------------
 demo/mpi/runexp-mpi.sh      | 19 -------------------
 demo/mpi/splitsvm.py        | 32 --------------------------------
 multi-node/README.md        | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/learner/learner-inl.hpp | 14 ++++++++++++++
 src/xgboost_main.cpp        |  2 +-
 8 files changed, 84 insertions(+), 93 deletions(-)
 delete mode 100644 demo/mpi/README.md
 delete mode 100644 demo/mpi/mpi.conf
 delete mode 100755 demo/mpi/runexp-mpi.sh
 delete mode 100644 demo/mpi/splitsvm.py
 create mode 100644 multi-node/README.md

diff --git a/Makefile b/Makefile
index 72c981706..51b7a578a 100644
--- a/Makefile
+++ b/Makefile
@@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
 MPIBIN = xgboost-mpi
 SLIB = wrapper/libxgboostwrapper.so
 
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
 
-all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
+all: $(BIN) $(OBJ) $(SLIB)
+mpi: $(MPIBIN)
 
 python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
diff --git a/demo/mpi/README.md b/demo/mpi/README.md
deleted file mode 100644
index 60fd0eb6e..000000000
--- a/demo/mpi/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This folder contains toy example script to run xgboost-mpi.
-
-This is an experimental distributed version of xgboost
diff --git a/demo/mpi/mpi.conf b/demo/mpi/mpi.conf
deleted file mode 100644
index 5b1f978d1..000000000
--- a/demo/mpi/mpi.conf
+++ /dev/null
@@ -1,36 +0,0 @@
-# General Parameters, see comment for each definition
-# choose the booster, can be gbtree or gblinear
-booster = gbtree
-# choose logistic regression loss function for binary classification
-objective = binary:logistic
-
-# Tree Booster Parameters
-# step size shrinkage
-eta = 1.0
-# minimum loss reduction required to make a further partition
-gamma = 1.0
-# minimum sum of instance weight(hessian) needed in a child
-min_child_weight = 1
-# maximum depth of a tree
-max_depth = 3
-
-# Task Parameters
-# the number of round to do boosting
-num_round = 2
-# 0 means do not save any model except the final round model
-save_period = 0
-use_buffer = 0
-
-
-# The path of training data %d is the wildcard for the rank of the data
-# The idea is each process take a feature matrix with subset of columns
-#
-data = "train.col%d"
-
-# The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "../data/agaricus.txt.test"
-# evaluate on training data as well each round
-eval_train = 1
-
-# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
-test:data = "agaricus.txt.test"
diff --git a/demo/mpi/runexp-mpi.sh b/demo/mpi/runexp-mpi.sh
deleted file mode 100755
index cc0c6d459..000000000
--- a/demo/mpi/runexp-mpi.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-if [[ $# -ne 1 ]]
-then
-    echo "Usage: nprocess"
-    exit -1
-fi
-
-rm -rf train.col*
-k=$1
-
-# split the lib svm file into k subfiles
-python splitsvm.py ../data/agaricus.txt.train train $k
-
-# run xgboost mpi
-mpirun -n $k ../../xgboost-mpi mpi.conf
-
-# the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
-cat dump.nice.$k.txt
diff --git a/demo/mpi/splitsvm.py b/demo/mpi/splitsvm.py
deleted file mode 100644
index 365aef610..000000000
--- a/demo/mpi/splitsvm.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/python
-import sys
-import random
-
-# split libsvm file into different subcolumns
-if len(sys.argv) < 4:
-    print ('Usage:<fin> <fo> k')
-    exit(0)
-
-random.seed(10)
-fmap = {}
-
-k = int(sys.argv[3])
-fi = open( sys.argv[1], 'r' )
-fos = []
-
-for i in range(k):
-    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
-
-for l in open(sys.argv[1]):
-    arr = l.split()
-    for f in fos:
-        f.write(arr[0])
-    for it in arr[1:]:
-        fid = int(it.split(':')[0])
-        if fid not in fmap:
-            fmap[fid] = random.randint(0, k-1)
-        fos[fmap[fid]].write(' '+it)
-    for f in fos:
-        f.write('\n')
-for f in fos:
-    f.close()
diff --git a/multi-node/README.md b/multi-node/README.md
new file mode 100644
index 000000000..fab7472e7
--- /dev/null
+++ b/multi-node/README.md
@@ -0,0 +1,66 @@
+Distributed XGBoost
+======
+This folder contains information about the experimental version of distributed xgboost.
+
+Build
+=====
+* You will need to have MPI installed
+* In the root folder, run ```make mpi```; this will give you xgboost-mpi
+
+Design Choice
+=====
+* Does distributed xgboost rely on MPI?
+  - Yes, but the dependency is isolated in the [sync](../src/sync/sync.h) module
+  - Specifically, xgboost relies only on an MPI protocol that provides Broadcast and AllReduce;
+    if a platform/framework implements these primitives, xgboost should naturally extend to it
+* How is the data distributed?
+  - There are two solvers in distributed xgboost
+  - The column-based solver splits the data by column; each node works on a subset of columns
+    and uses exactly the same algorithm as the single-node version (see the sketch after this list).
+  - The row-based solver splits the data by row; each node works on a subset of rows
+    and uses an approximate histogram counting algorithm, which only examines a subset of
+    the potential split points, as opposed to all of them.
+* How to run the distributed version
+  - The current code runs in an MPI environment; you will need a network filesystem,
+    or to copy the data to each machine's local filesystem, before running the code
+  - The distributed version is still optimized for multi-threading.
+    You should run one xgboost-mpi per node and let it use most of the available CPUs;
+    this reduces the communication overhead and improves performance.
+  - One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
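+
+To make the column split concrete, here is a minimal Python sketch of how a
+libsvm training file can be divided into per-process column shards. It is
+modeled on the removed demo script `demo/mpi/splitsvm.py`; the input path,
+output names, and shard count are illustrative only:
+
+```python
+import random
+
+random.seed(10)   # fixed seed so every run produces the same shard assignment
+fmap = {}         # feature id -> shard index
+k = 4             # number of column shards, one per worker process
+fos = [open('train.col%d' % i, 'w') for i in range(k)]
+for line in open('../data/agaricus.txt.train'):
+    arr = line.split()
+    for fo in fos:            # every shard keeps the label
+        fo.write(arr[0])
+    for it in arr[1:]:        # route each feature to exactly one shard
+        fid = int(it.split(':')[0])
+        if fid not in fmap:
+            fmap[fid] = random.randint(0, k - 1)
+        fos[fmap[fid]].write(' ' + it)
+    for fo in fos:
+        fo.write('\n')
+for fo in fos:
+    fo.close()
+```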
+
+Examples
+====
+* [Column-based version](col-split)
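+
+As a hypothetical end-to-end run (process count, file names, and the configuration
+file are illustrative, following the removed demo under demo/mpi): split the
+training file into 4 column shards with the sketch above, then start the workers
+with `mpirun -n 4 ../../xgboost-mpi mpi.conf dsplit=col`, where `dsplit=col` is
+the column-split mode introduced in this patch.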
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 89bc28aec..b1a95dd96 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -32,6 +32,7 @@ class BoostLearner {
     silent= 0;
     prob_buffer_row = 1.0f;
     part_load_col = 0;
+    distributed_mode = 0;
   }
   ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -89,6 +90,17 @@ class BoostLearner {
       this->SetParam(n.c_str(), val);
     }
     if (!strcmp(name, "silent")) silent = atoi(val);
+    if (!strcmp(name, "dsplit")) {
+      if (!strcmp(val, "col")) {
+        this->SetParam("updater", "distcol,prune");
+        distributed_mode = 1;
+      } else if (!strcmp(val, "row")) {
+        this->SetParam("updater", "grow_histmaker,prune");
+        distributed_mode = 2;
+      } else {
+        utils::Error("%s is an invalid value for dsplit; it must be row or col", val);
+      }
+    }
     if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
     if (!strcmp(name, "prob_buffer_row")) {
       prob_buffer_row = static_cast<float>(atof(val));
@@ -352,6 +364,8 @@ class BoostLearner {
   // data fields
   // silent during training
   int silent;
+  // distributed learning mode: 0 = none, 1 = column split, 2 = row split
+  int distributed_mode;
   // randomly load part of data
   int part_load_col;
   // maximum buffred row value
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index ef3a9079c..1b596ebfb 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -32,7 +32,7 @@ class BoostLearnTask {
       }
     }
     if (sync::IsDistributed()) {
-      this->SetParam("updater", "distcol");
+      this->SetParam("data_split", "col");
     }
     if (sync::GetRank() != 0) {
       this->SetParam("silent", "2");