add native script
This commit is contained in:
parent
1bcea65117
commit
bfb9aa3d77
@ -85,7 +85,8 @@ class AllreduceMock : public AllreduceRobust {
|
||||
inline void Verify(const MockKey &key, const char *name) {
|
||||
if (mock_map.count(key) != 0) {
|
||||
num_trial += 1;
|
||||
utils::Error("[%d]@@@Hit Mock Error:%s", rank, name);
|
||||
fprintf(stderr, "[%d]@@@Hit Mock Error:%s\n", rank, name);
|
||||
exit(-2);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
#!/bin/bash
|
||||
if [ "$#" -lt 1 ];
|
||||
then
|
||||
echo "Usage: program parameters"
|
||||
echo "Repeatively run program until success"
|
||||
exit -1
|
||||
fi
|
||||
nrep=0
|
||||
echo ./$@ rabit_task_id=$OMPI_COMM_WORLD_RANK
|
||||
until ./$@ rabit_task_id=$OMPI_COMM_WORLD_RANK rabit_num_trial=$nrep; do
|
||||
sleep 1
|
||||
nrep=$((nrep+1))
|
||||
echo ./$@ rabit_task_id=$OMPI_COMM_WORLD_RANK rabit_num_trial=$nrep
|
||||
done
|
||||
10
test/test.mk
10
test/test.mk
@ -10,17 +10,17 @@ endif
|
||||
|
||||
|
||||
local_recover:
|
||||
../tracker/rabit_mpi.py -n $(nslave) test_local_recover $(ndata) rabit_local_replica=1
|
||||
../tracker/rabit_demo.py -n $(nslave) test_local_recover $(ndata) rabit_local_replica=1
|
||||
|
||||
local_recover_10_10k:
|
||||
../tracker/rabit_mpi.py -n 10 test_local_recover 10000 rabit_local_replica=1
|
||||
../tracker/rabit_demo.py -n 10 test_local_recover 10000 rabit_local_replica=1
|
||||
|
||||
# this experiment test recovery with actually process exit, use keepalive to keep program alive
|
||||
model_recover_10_10k:
|
||||
../tracker/rabit_mpi.py -n 10 keepalive.sh test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
|
||||
model_recover_10_10k_die_same:
|
||||
../tracker/rabit_mpi.py -n 10 keepalive.sh test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
model_recover_10_10k_die_hard:
|
||||
../tracker/rabit_mpi.py -n 10 keepalive.sh test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
57
tracker/rabit_demo.py
Executable file
57
tracker/rabit_demo.py
Executable file
@ -0,0 +1,57 @@
|
||||
#!/usr/bin/python
|
||||
"""
|
||||
This is the demo submission script of rabit, it is created to
|
||||
submit rabit jobs using hadoop streaming
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
from threading import Thread
|
||||
import rabit_tracker as tracker
|
||||
|
||||
parser = argparse.ArgumentParser(description='Rabit script to submit rabit job locally using python subprocess')
|
||||
parser.add_argument('-n', '--nworker', required=True, type=int,
|
||||
help = 'number of worker proccess to be launched')
|
||||
parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
|
||||
help = 'print more messages into the console')
|
||||
parser.add_argument('command', nargs='+',
|
||||
help = 'command for rabit program')
|
||||
args = parser.parse_args()
|
||||
|
||||
def exec_cmd(cmd, taskid):
|
||||
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]):
|
||||
cmd[0] = './' + cmd[0]
|
||||
cmd = ' '.join(cmd)
|
||||
ntrial = 0
|
||||
while True:
|
||||
arg = ' rabit_task_id=%d rabit_num_trial=%d' % (taskid, ntrial)
|
||||
ret = subprocess.call(cmd + arg, shell = True)
|
||||
if ret == 254 or ret == -2:
|
||||
ntrial += 1
|
||||
continue
|
||||
if ret == 0:
|
||||
return
|
||||
raise Exception('Get nonzero return code=%d' % ret)
|
||||
#
|
||||
# Note: this submit script is only used for demo purpose
|
||||
# submission script using pyhton multi-threading
|
||||
#
|
||||
def mthread_submit(nslave, slave_args):
|
||||
"""
|
||||
customized submit script, that submit nslave jobs, each must contain args as parameter
|
||||
note this can be a lambda function containing additional parameters in input
|
||||
Parameters
|
||||
nslave number of slave process to start up
|
||||
args arguments to launch each job
|
||||
this usually includes the parameters of master_uri and parameters passed into submit
|
||||
"""
|
||||
procs = {}
|
||||
for i in range(nslave):
|
||||
procs[i] = Thread(target = exec_cmd, args = (args.command + slave_args, i))
|
||||
procs[i].start()
|
||||
for i in range(nslave):
|
||||
procs[i].join()
|
||||
|
||||
# call submit, with nslave, the commands to run each job and submit function
|
||||
tracker.submit(args.nworker, [], fun_submit = mthread_submit, verbose = args.verbose)
|
||||
@ -20,9 +20,7 @@ parser.add_argument('command', nargs='+',
|
||||
help = 'command for rabit program')
|
||||
args = parser.parse_args()
|
||||
#
|
||||
# Note: this submit script is only used for demo purpose
|
||||
# It does not have to be mpirun, it can be any job submission
|
||||
# script that starts the job, qsub, hadoop streaming etc.
|
||||
# submission script using MPI
|
||||
#
|
||||
def mpi_submit(nslave, slave_args):
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user