recheck column mode
This commit is contained in:
parent
dffcbc838b
commit
54e2ed90d7
5
Makefile
5
Makefile
@ -17,9 +17,10 @@ MPIOBJ = sync_mpi.o
|
|||||||
MPIBIN = xgboost-mpi
|
MPIBIN = xgboost-mpi
|
||||||
SLIB = wrapper/libxgboostwrapper.so
|
SLIB = wrapper/libxgboostwrapper.so
|
||||||
|
|
||||||
.PHONY: clean all python Rpack
|
.PHONY: clean all mpi python Rpack
|
||||||
|
|
||||||
all: $(BIN) $(OBJ) $(SLIB) $(MPIOBJ) $(MPIBIN)
|
all: $(BIN) $(OBJ) $(SLIB)
|
||||||
|
mpi: $(MPIBIN)
|
||||||
|
|
||||||
python: wrapper/libxgboostwrapper.so
|
python: wrapper/libxgboostwrapper.so
|
||||||
# now the wrapper takes in two files. io and wrapper part
|
# now the wrapper takes in two files. io and wrapper part
|
||||||
|
|||||||
@ -1,3 +0,0 @@
|
|||||||
This folder contains toy example script to run xgboost-mpi.
|
|
||||||
|
|
||||||
This is an experimental distributed version of xgboost
|
|
||||||
@ -1,36 +0,0 @@
|
|||||||
# General Parameters, see comment for each definition
|
|
||||||
# choose the booster, can be gbtree or gblinear
|
|
||||||
booster = gbtree
|
|
||||||
# choose logistic regression loss function for binary classification
|
|
||||||
objective = binary:logistic
|
|
||||||
|
|
||||||
# Tree Booster Parameters
|
|
||||||
# step size shrinkage
|
|
||||||
eta = 1.0
|
|
||||||
# minimum loss reduction required to make a further partition
|
|
||||||
gamma = 1.0
|
|
||||||
# minimum sum of instance weight(hessian) needed in a child
|
|
||||||
min_child_weight = 1
|
|
||||||
# maximum depth of a tree
|
|
||||||
max_depth = 3
|
|
||||||
|
|
||||||
# Task Parameters
|
|
||||||
# the number of round to do boosting
|
|
||||||
num_round = 2
|
|
||||||
# 0 means do not save any model except the final round model
|
|
||||||
save_period = 0
|
|
||||||
use_buffer = 0
|
|
||||||
|
|
||||||
|
|
||||||
# The path of training data %d is the wildcard for the rank of the data
|
|
||||||
# The idea is each process take a feature matrix with subset of columns
|
|
||||||
#
|
|
||||||
data = "train.col%d"
|
|
||||||
|
|
||||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
|
||||||
eval[test] = "../data/agaricus.txt.test"
|
|
||||||
# evaluate on training data as well each round
|
|
||||||
eval_train = 1
|
|
||||||
|
|
||||||
# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
|
|
||||||
test:data = "agaricus.txt.test"
|
|
||||||
@ -1,19 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
if [[ $# -ne 1 ]]
|
|
||||||
then
|
|
||||||
echo "Usage: nprocess"
|
|
||||||
exit -1
|
|
||||||
fi
|
|
||||||
|
|
||||||
rm -rf train.col*
|
|
||||||
k=$1
|
|
||||||
|
|
||||||
# split the lib svm file into k subfiles
|
|
||||||
python splitsvm.py ../data/agaricus.txt.train train $k
|
|
||||||
|
|
||||||
# run xgboost mpi
|
|
||||||
mpirun -n $k ../../xgboost-mpi mpi.conf
|
|
||||||
|
|
||||||
# the model can be directly loaded by the single-machine xgboost solver, as usual
|
|
||||||
../../xgboost mpi.conf task=dump model_in=0002.model fmap=../data/featmap.txt name_dump=dump.nice.$k.txt
|
|
||||||
cat dump.nice.$k.txt
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
import sys
|
|
||||||
import random
|
|
||||||
|
|
||||||
# split libsvm file into different subcolumns
|
|
||||||
if len(sys.argv) < 4:
|
|
||||||
print ('Usage:<fin> <fo> k')
|
|
||||||
exit(0)
|
|
||||||
|
|
||||||
random.seed(10)
|
|
||||||
fmap = {}
|
|
||||||
|
|
||||||
k = int(sys.argv[3])
|
|
||||||
fi = open( sys.argv[1], 'r' )
|
|
||||||
fos = []
|
|
||||||
|
|
||||||
for i in range(k):
|
|
||||||
fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
|
|
||||||
|
|
||||||
for l in open(sys.argv[1]):
|
|
||||||
arr = l.split()
|
|
||||||
for f in fos:
|
|
||||||
f.write(arr[0])
|
|
||||||
for it in arr[1:]:
|
|
||||||
fid = int(it.split(':')[0])
|
|
||||||
if fid not in fmap:
|
|
||||||
fmap[fid] = random.randint(0, k-1)
|
|
||||||
fos[fmap[fid]].write(' '+it)
|
|
||||||
for f in fos:
|
|
||||||
f.write('\n')
|
|
||||||
for f in fos:
|
|
||||||
f.close()
|
|
||||||
33
multi-node/README.md
Normal file
33
multi-node/README.md
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
Distributed XGBoost
|
||||||
|
======
|
||||||
|
This folder contains information about the experimental version of distributed xgboost.
|
||||||
|
|
||||||
|
Build
|
||||||
|
=====
|
||||||
|
* You will need to have MPI
|
||||||
|
* In the root folder, run ```make mpi```, this will give you xgboost-mpi
|
||||||
|
|
||||||
|
Design Choice
|
||||||
|
=====
|
||||||
|
* Does distributed xgboost rely on MPI?
|
||||||
|
- Yes, but the dependency is isolated in [sync](../src/sync/sync.h) module
|
||||||
|
- Specifically, xgboost relies on MPI protocols that provide Broadcast and AllReduce,
|
||||||
|
if there are platforms/frameworks that implement these protocols, xgboost should naturally extend to these platforms
|
||||||
|
* How is the data distributed?
|
||||||
|
- There are two solvers in distributed xgboost
|
||||||
|
- The column-based solver splits data by column, each node works on a subset of columns,
|
||||||
|
it uses exactly the same algorithm as single node version.
|
||||||
|
- The row-based solver splits data by row, each node works on a subset of rows,
|
||||||
|
it uses an approximate histogram count algorithm, and will only examine a subset of
|
||||||
|
potential split points as opposed to all split points.
|
||||||
|
* How to run the distributed version
|
||||||
|
- The current code runs in an MPI environment, you will need to have a network filesystem,
|
||||||
|
or copy data to local file system before running the code
|
||||||
|
- The distributed version is still multi-threading optimized.
|
||||||
|
You should run one xgboost-mpi per node that takes most available CPU,
|
||||||
|
this will reduce the communication overhead and improve the performance.
|
||||||
|
- One way to do that is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
====
|
||||||
|
* [Column-based version](col-split)
|
||||||
@ -32,6 +32,7 @@ class BoostLearner {
|
|||||||
silent= 0;
|
silent= 0;
|
||||||
prob_buffer_row = 1.0f;
|
prob_buffer_row = 1.0f;
|
||||||
part_load_col = 0;
|
part_load_col = 0;
|
||||||
|
distributed_mode = 0;
|
||||||
}
|
}
|
||||||
~BoostLearner(void) {
|
~BoostLearner(void) {
|
||||||
if (obj_ != NULL) delete obj_;
|
if (obj_ != NULL) delete obj_;
|
||||||
@ -89,6 +90,17 @@ class BoostLearner {
|
|||||||
this->SetParam(n.c_str(), val);
|
this->SetParam(n.c_str(), val);
|
||||||
}
|
}
|
||||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||||
|
if (!strcmp(name, "dsplit")) {
|
||||||
|
if (!strcmp(val, "col")) {
|
||||||
|
this->SetParam("updater", "distcol,prune");
|
||||||
|
distributed_mode = 1;
|
||||||
|
} else if (!strcmp(val, "row")) {
|
||||||
|
this->SetParam("updater", "grow_histmaker,prune");
|
||||||
|
distributed_mode = 2;
|
||||||
|
} else {
|
||||||
|
utils::Error("%s is invalid value for dsplit, should be row or col", val);
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
|
if (!strcmp(name, "part_load_col")) part_load_col = atoi(val);
|
||||||
if (!strcmp(name, "prob_buffer_row")) {
|
if (!strcmp(name, "prob_buffer_row")) {
|
||||||
prob_buffer_row = static_cast<float>(atof(val));
|
prob_buffer_row = static_cast<float>(atof(val));
|
||||||
@ -352,6 +364,8 @@ class BoostLearner {
|
|||||||
// data fields
|
// data fields
|
||||||
// silent during training
|
// silent during training
|
||||||
int silent;
|
int silent;
|
||||||
|
// distributed learning mode, if any, 0:none, 1:col, 2:row
|
||||||
|
int distributed_mode;
|
||||||
// randomly load part of data
|
// randomly load part of data
|
||||||
int part_load_col;
|
int part_load_col;
|
||||||
// maximum buffred row value
|
// maximum buffred row value
|
||||||
|
|||||||
@ -32,7 +32,7 @@ class BoostLearnTask {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (sync::IsDistributed()) {
|
if (sync::IsDistributed()) {
|
||||||
this->SetParam("updater", "distcol");
|
this->SetParam("data_split", "col");
|
||||||
}
|
}
|
||||||
if (sync::GetRank() != 0) {
|
if (sync::GetRank() != 0) {
|
||||||
this->SetParam("silent", "2");
|
this->SetParam("silent", "2");
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user