recheck column mode

tqchen 2014-11-19 11:21:07 -08:00
parent dffcbc838b
commit 54e2ed90d7
8 changed files with 51 additions and 93 deletions


@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
MPIBIN = xgboost-mpi
SLIB = wrapper/libxgboostwrapper.so
.PHONY: clean all python Rpack
.PHONY: clean all mpi python Rpack
all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
all: $(BIN) $(OBJ) $(SLIB)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
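In terms of build commands, the change above can be summarized as follows; this is a sketch assuming a working MPI toolchain, which the diff itself does not show:

```sh
make        # builds xgboost, the object files, and wrapper/libxgboostwrapper.so
make mpi    # builds the xgboost-mpi binary separately
```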


@ -1,3 +0,0 @@
This folder contains a toy example script to run xgboost-mpi.
This is an experimental distributed version of xgboost.


@ -1,36 +0,0 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of rounds to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0
# The path of the training data; %d is the wildcard for the rank of the data
# The idea is that each process takes a feature matrix with a subset of the columns
#
data = "train.col%d"
# The path of validation data, used to monitor training process; here [test] sets the name of the validation set
eval[test] = "../data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data; the full test data is needed here, so try not to use it, or keep a subsampled version
test:data = "agaricus.txt.test"
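As an illustration of the %d wildcard above, a hypothetical 3-process run would lay the data out as follows (the shard names follow the train.col%d pattern; nothing beyond that is specified in this config):

```sh
# rank 0 reads train.col0
# rank 1 reads train.col1
# rank 2 reads train.col2
ls train.col*    # train.col0  train.col1  train.col2
```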


@ -1,19 +0,0 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
    echo "Usage: nprocess"
    exit -1
fi
rm -rf train.col*
k=$1
# split the lib svm file into k subfiles
python splitsvm.py ../data/agaricus.txt.train train $k
# run xgboost mpi
mpirun -n $k ../../xgboost-mpi mpi.conf
# the model can be directly loaded by the single-machine xgboost solver, as usual
../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
cat dump.nice.$k.txt


@ -1,32 +0,0 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different subcolumns
if len(sys.argv) < 4:
    print ('Usage:<fin> <fo> k')
    exit(0)
random.seed(10)
fmap = {}
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
    fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
for l in open(sys.argv[1]):
    arr = l.split()
    for f in fos:
        f.write(arr[0])
    for it in arr[1:]:
        fid = int(it.split(':')[0])
        if fid not in fmap:
            fmap[fid] = random.randint(0, k-1)
        fos[fmap[fid]].write(' '+it)
    for f in fos:
        f.write('\n')
for f in fos:
    f.close()
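A usage sketch for the script above, mirroring the invocation in the run script earlier in this commit (the shard count of 3 is arbitrary):

```sh
# split the libsvm training file into 3 column shards
python splitsvm.py ../data/agaricus.txt.train train 3
# produces train.col0, train.col1, train.col2; every shard keeps the label,
# and each feature id is routed to exactly one shard (chosen at random)
```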

multi-node/README.md (new file, 33 lines added)

@ -0,0 +1,33 @@
Distributed XGBoost
======
This folder contains information about the experimental version of distributed xgboost.
Build
=====
* You will need to have MPI
* In the root folder, run ```make mpi```; this will give you the xgboost-mpi binary
Design Choice
=====
* Does distributed xgboost rely on MPI?
  - Yes, but the dependency is isolated in the [sync](../src/sync/sync.h) module
  - Specifically, xgboost relies on an MPI protocol that provides Broadcast and AllReduce;
    if a platform or framework implements these primitives, xgboost should naturally extend to it
* How is the data distributed?
  - There are two solvers in distributed xgboost
  - The column-based solver splits the data by column; each node works on a subset of columns
    and uses exactly the same algorithm as the single-node version.
  - The row-based solver splits the data by row; each node works on a subset of rows
    and uses an approximate histogram counting algorithm that only examines a subset of the
    potential split points, as opposed to all of them.
* How to run the distributed version?
  - The current code runs in an MPI environment; you will need a network filesystem,
    or you must copy the data to the local filesystem before running the code
    (a minimal launch sketch is shown after this list).
  - The distributed version is still optimized for multi-threading.
    You should run one xgboost-mpi process per node and let it use most of the available CPUs;
    this will reduce communication overhead and improve performance.
  - One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
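A hypothetical column-split launch, pieced together from the removed run script and the new dsplit option; the conf file name and the shard count are placeholders, not part of this diff:

```sh
# shard the training file by column, one shard per MPI process
python splitsvm.py ../data/agaricus.txt.train train 4
# launch the distributed solver; dsplit=col selects the column-based updaters
mpirun -n 4 ./xgboost-mpi mpi.conf dsplit=col
```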
Examples
====
* [Column-based version](col-split)


@ -32,6 +32,7 @@ class BoostLearner {
    silent = 0;
    prob_buffer_row = 1.0f;
    part_load_col = 0;
    distributed_mode = 0;
  }
  ~BoostLearner(void) {
    if (obj_ != NULL) delete obj_;
@ -89,6 +90,17 @@ class BoostLearner {
      this->SetParam(n.c_str(), val);
    }
    if (!strcmp(name, "silent")) silent = atoi(val);
    if (!strcmp(name, "dsplit")) {
      if (!strcmp(val, "col")) {
        this->SetParam("updater", "distcol,prune");
        distributed_mode = 1;
      } else if (!strcmp(val, "row")) {
        this->SetParam("updater", "grow_histmaker,prune");
        distributed_mode = 2;
      } else {
        utils::Error("%s is invalid value for dsplit, should be row or col", val);
      }
    }
    if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
    if (!strcmp(name, "prob_buffer_row")) {
      prob_buffer_row = static_cast<float>(atof(val));
@ -352,6 +364,8 @@ class BoostLearner {
  // data fields
  // silent during training
  int silent;
  // distributed learning mode, if any, 0:none, 1:col, 2:row
  int distributed_mode;
  // randomly load part of data
  int part_load_col;
  // maximum buffered row value
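In effect, the new dsplit parameter is a shorthand for an updater configuration. A sketch of the mapping implemented above; the command line follows the key=value style used by the run script, and the invocation itself is hypothetical:

```sh
# dsplit=col -> updater=distcol,prune          (distributed_mode = 1)
# dsplit=row -> updater=grow_histmaker,prune   (distributed_mode = 2)
# any other value raises utils::Error
mpirun -n 2 ./xgboost-mpi mpi.conf dsplit=row
```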


@ -32,7 +32,7 @@ class BoostLearnTask {
      }
    }
    if (sync::IsDistributed()) {
      this->SetParam("updater", "distcol");
      this->SetParam("data_split", "col");
    }
    if (sync::GetRank() != 0) {
      this->SetParam("silent", "2");