add native script

This commit is contained in:
tqchen 2014-12-30 04:37:50 -08:00
parent 1bcea65117
commit bfb9aa3d77
5 changed files with 65 additions and 23 deletions

View File

@ -85,7 +85,8 @@ class AllreduceMock : public AllreduceRobust {
// Simulated-failure check: when `key` is present in mock_map, count the trial,
// report a "Hit Mock Error" tagged with this rank and `name`, and terminate.
// Does nothing when `key` is not in mock_map.
// NOTE(review): this listing appears to show a diff's removed and added lines
// together (utils::Error vs. fprintf + exit) -- confirm which lines the real file keeps.
inline void Verify(const MockKey &key, const char *name) {
if (mock_map.count(key) != 0) {
num_trial += 1;
// presumably utils::Error reports the message and aborts/throws -- verify in utils
utils::Error("[%d]@@@Hit Mock Error:%s", rank, name);
fprintf(stderr, "[%d]@@@Hit Mock Error:%s\n", rank, name);
// exit(-2) is normally seen by the parent as status 254; the rabit_demo.py
// launcher in this commit restarts a worker on return code 254 or -2.
exit(-2);
}
}
};

View File

@ -1,14 +0,0 @@
#!/bin/bash
# keepalive.sh: run ./<program> <parameters...> over and over until it exits
# with status 0. Every attempt is given rabit_task_id (taken from the
# OMPI_COMM_WORLD_RANK environment variable) and rabit_num_trial (the number
# of failed attempts so far) as extra arguments.
#
# "$@" and the rank variable are quoted so that parameters containing spaces
# or glob characters reach the program unchanged instead of being re-split.
if [ "$#" -lt 1 ]; then
  echo "Usage: program parameters"
  echo "Repeatively run program until success"
  exit -1
fi

nrep=0
echo ./"$@" rabit_task_id="$OMPI_COMM_WORLD_RANK"
# ./"$@" expands to ./<first argument> followed by the remaining arguments
until ./"$@" rabit_task_id="$OMPI_COMM_WORLD_RANK" rabit_num_trial="$nrep"; do
  # brief pause between attempts, then log the retry that is about to run
  sleep 1
  nrep=$((nrep+1))
  echo ./"$@" rabit_task_id="$OMPI_COMM_WORLD_RANK" rabit_num_trial="$nrep"
done

View File

@ -10,17 +10,17 @@ endif
# Recovery test targets. Each recipe starts a launcher script from ../tracker
# with -n <number of workers>, followed by the test program and its arguments.
# NOTE(review): every target below lists both a rabit_mpi.py line and a
# rabit_demo.py line; this looks like a diff's old and new lines shown
# together -- confirm which launcher line the Makefile actually keeps.

# Local recovery test sized by $(nslave) workers and $(ndata), with rabit_local_replica=1.
local_recover:
../tracker/rabit_mpi.py -n $(nslave) test_local_recover $(ndata) rabit_local_replica=1
../tracker/rabit_demo.py -n $(nslave) test_local_recover $(ndata) rabit_local_replica=1
# Same test with fixed sizes: 10 workers and 10000 as the program argument.
local_recover_10_10k:
../tracker/rabit_mpi.py -n 10 test_local_recover 10000 rabit_local_replica=1
../tracker/rabit_demo.py -n 10 test_local_recover 10000 rabit_local_replica=1
# These experiments test recovery when a worker process actually exits; the
# launcher (keepalive.sh, or the retry loop in rabit_demo.py) restarts the
# program to keep the job alive.
# NOTE(review): each mock=a,b,c,d argument presumably registers one simulated
# failure point for the mock engine -- confirm the meaning of the four fields.
model_recover_10_10k:
../tracker/rabit_mpi.py -n 10 keepalive.sh test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
# Variant with five mock= entries.
model_recover_10_10k_die_same:
../tracker/rabit_mpi.py -n 10 keepalive.sh test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
# Variant with eight mock= entries.
model_recover_10_10k_die_hard:
../tracker/rabit_mpi.py -n 10 keepalive.sh test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0

57
tracker/rabit_demo.py Executable file
View File

@ -0,0 +1,57 @@
#!/usr/bin/python
"""
This is the demo submission script of rabit. It launches a rabit job
locally: every worker is a python subprocess driven by its own thread,
and the job is handed to the rabit tracker for coordination.
"""
import argparse
import sys
import os
import subprocess
from threading import Thread
import rabit_tracker as tracker
# Command-line interface: -n/--nworker is required, -v/--verbose is 0 or 1
# (default 0), and all remaining words form the worker command.
parser = argparse.ArgumentParser(description='Rabit script to submit rabit job locally using python subprocess')
parser.add_argument('-n', '--nworker', required=True, type=int,
help = 'number of worker proccess to be launched')
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
help = 'print more messages into the console')
# nargs='+' collects one or more words into the list args.command
parser.add_argument('command', nargs='+',
help = 'command for rabit program')
# parsed once at import time; args is read by the functions defined below
args = parser.parse_args()
def exec_cmd(cmd, taskid):
    """
    Run one worker command through the shell, restarting it for as long as it
    reports a simulated (mock) failure.

    Parameters
        cmd     list of command words; when the first word contains no '/'
                and names an existing path, it is run as './<word>'.
                The caller's list is left unmodified.
        taskid  integer passed to the program as rabit_task_id

    Returns None once the command exits with return code 0.
    Raises Exception when the return code is anything other than 0, 254 or -2.
    """
    # work on a copy so the './' prefix never leaks into the caller's list
    words = list(cmd)
    if words[0].find('/') == -1 and os.path.exists(words[0]):
        words[0] = './' + words[0]
    cmdline = ' '.join(words)
    ntrial = 0
    while True:
        # each attempt tells the program its task id and how often it was restarted
        arg = ' rabit_task_id=%d rabit_num_trial=%d' % (taskid, ntrial)
        ret = subprocess.call(cmdline + arg, shell = True)
        if ret == 0:
            return
        # 254 is how a child's exit(-2) is reported; -2 is kept from the
        # original check (subprocess uses negative codes for signal deaths)
        if ret == 254 or ret == -2:
            ntrial += 1
            continue
        raise Exception('Get nonzero return code=%d' % ret)
#
# Note: this submit script is only used for demo purpose
# submission script using python multi-threading
#
def mthread_submit(nslave, slave_args):
    """
    Submit function that runs the job locally with one thread per worker.

    Each thread calls exec_cmd with the user's command (args.command)
    followed by slave_args, using the worker index as its task id.
    The call returns after every thread has finished.

    Parameters
        nslave      number of worker processes to start
        slave_args  list of extra arguments appended to each worker's command
    """
    workers = [Thread(target = exec_cmd, args = (args.command + slave_args, rank))
               for rank in range(nslave)]
    for worker in workers:
        worker.start()
    # wait for all workers before handing control back to the caller
    for worker in workers:
        worker.join()
# Start the job through the tracker: request args.nworker workers, pass an empty
# argument list, and use mthread_submit as the function that launches them.
# NOTE(review): the exact contract of submit() is defined in rabit_tracker -- confirm there.
tracker.submit(args.nworker, [], fun_submit = mthread_submit, verbose = args.verbose)

View File

@ -20,9 +20,7 @@ parser.add_argument('command', nargs='+',
help = 'command for rabit program')
args = parser.parse_args()
#
# Note: this submit script is only used for demo purpose
# It does not have to be mpirun, it can be any job submission
# script that starts the job, qsub, hadoop streaming etc.
# submission script using MPI
#
def mpi_submit(nslave, slave_args):
"""