From 03e24cf59088326e4ebd4a9f149fa761ae222ccd Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 19 Nov 2014 11:22:17 -0800
Subject: [PATCH] check multinode

---
Review note: the two script bodies below were revised after the original
commit; the `index` blob ids are kept from that commit and no longer match
the revised file contents.

 multi-node/col-split/README.md     |  2 ++
 multi-node/col-split/runexp-mpi.sh | 25 +++++++++++++++++++++++++
 multi-node/col-split/splitsvm.py   | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 multi-node/col-split/README.md
 create mode 100755 multi-node/col-split/runexp-mpi.sh
 create mode 100644 multi-node/col-split/splitsvm.py

diff --git a/multi-node/col-split/README.md b/multi-node/col-split/README.md
new file mode 100644
index 000000000..14fe993d2
--- /dev/null
+++ b/multi-node/col-split/README.md
@@ -0,0 +1,2 @@
+Column Split Version of XGBoost
+====
diff --git a/multi-node/col-split/runexp-mpi.sh b/multi-node/col-split/runexp-mpi.sh
new file mode 100755
index 000000000..d5469e714
--- /dev/null
+++ b/multi-node/col-split/runexp-mpi.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Column-split XGBoost MPI demo: split the mushroom training data into
+# <nprocess> column subfiles, train with that many MPI processes, then dump
+# the resulting model in readable form.
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: nprocess"
+    # exit statuses are 0-255; -1 is out of range, use 1 for a usage error
+    exit 1
+fi
+
+rm -rf train.col*
+k=$1
+
+# split the libsvm file into k subfiles (train.col0 .. train.col<k-1>)
+python splitsvm.py ../../demo/data/agaricus.txt.train train "$k"
+
+# run xgboost mpi
+# NOTE(review): training reads mushroom-col.conf while the dump step below
+# reads mpi.conf; neither file is part of this patch -- confirm both exist.
+mpirun -n "$k" ../../xgboost-mpi mushroom-col.conf dsplit=col
+
+# the model can be directly loaded by the single-machine xgboost solver, as usual
+../../xgboost mpi.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump="dump.nice.$k.txt"
+cat "dump.nice.$k.txt"
diff --git a/multi-node/col-split/splitsvm.py b/multi-node/col-split/splitsvm.py
new file mode 100644
index 000000000..365aef610
--- /dev/null
+++ b/multi-node/col-split/splitsvm.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+"""Split a libsvm file column-wise into k subfiles.
+
+Usage: splitsvm.py <input.libsvm> <output-prefix> <k>
+
+Each feature id is assigned to one of the k output files,
+<output-prefix>.col0 .. <output-prefix>.col<k-1>, the first time it is seen.
+The assignment is random, but the seed is fixed so the split is repeatable.
+Every output file receives one line per input row, starting with the row's
+first token (the label), so rows stay aligned across all subfiles.
+"""
+import sys
+import random
+
+# split libsvm file into different subcolumns
+if len(sys.argv) < 4:
+    print ('Usage: <input.libsvm> <output-prefix> <k>')
+    # a usage error is a failure: report it with a non-zero status
+    sys.exit(1)
+
+random.seed(10)
+# feature id -> index of the output file that owns this column
+fmap = {}
+
+k = int(sys.argv[3])
+fos = []
+
+try:
+    for i in range(k):
+        fos.append(open(sys.argv[2] + '.col%d' % i, 'w'))
+
+    # read the input through a single handle that is closed when done
+    with open(sys.argv[1], 'r') as fi:
+        for l in fi:
+            arr = l.split()
+            if not arr:
+                # blank line: no label to write, skip it in every subfile
+                continue
+            for f in fos:
+                f.write(arr[0])
+            for it in arr[1:]:
+                fid = int(it.split(':')[0])
+                if fid not in fmap:
+                    fmap[fid] = random.randint(0, k - 1)
+                fos[fmap[fid]].write(' ' + it)
+            for f in fos:
+                f.write('\n')
+finally:
+    # close the outputs even if parsing fails part-way through
+    for f in fos:
+        f.close()