This commit is contained in:
tqchen 2015-01-14 22:11:00 -08:00
parent a7faac2f09
commit 6dbaddd2b9
2 changed files with 31 additions and 9 deletions

View File

@ -1,8 +1,7 @@
# this is a makefile used to show testcases of rabit # this is a makefile used to show testcases of rabit
.PHONY: .PHONY:all
test: all:
../tracker/rabit_demo.py -v 1 -n 10 test_model_recover 1 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=8,1,2,0
# this experiment test recovery with actually process exit, use keepalive to keep program alive # this experiment test recovery with actually process exit, use keepalive to keep program alive
model_recover_10_10k: model_recover_10_10k:
@ -14,3 +13,6 @@ model_recover_10_10k_die_same:
model_recover_10_10k_die_hard: model_recover_10_10k_die_hard:
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 ../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
local_recover_10_10k:
../tracker/rabit_demo.py -n 10 test_local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1

View File

@ -20,20 +20,40 @@ parser.add_argument('command', nargs='+',
help = 'command for rabit program') help = 'command for rabit program')
args = parser.parse_args() args = parser.parse_args()
# Bash keepalive wrapper.
# The retry loop runs inside bash so that Python does not need to
# communicate with the subprocess between restarts.
#
# Template for an optional bash `echo` line: the %s slot takes the command
# text, while $nrep is left for bash to expand at run time.
echo="echo %s rabit_num_trial=$nrep;"
# Bash loop template with three %s slots, in order: a line executed before
# each attempt (may be empty), then two pieces that bash joins into the
# command line, with rabit_num_trial=$nrep appended as the last argument.
# rc starts at 254, so the body runs at least once; the command is re-run
# for as long as it exits with 254, and nrep is incremented after every
# attempt.
# NOTE(review): the last command in the loop body is the nrep assignment,
# so the script's own exit status may not be the program's exit code --
# confirm that callers checking the return code see what they expect.
keepalive = """
nrep=0
rc=254
while [ $rc -eq 254 ];
do
%s
%s %s rabit_num_trial=$nrep
rc=$?;
nrep=$((nrep+1));
done
"""
def exec_cmd(cmd, taskid): def exec_cmd(cmd, taskid):
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]): if cmd[0].find('/') == -1 and os.path.exists(cmd[0]):
cmd[0] = './' + cmd[0] cmd[0] = './' + cmd[0]
cmd = ' '.join(cmd) cmd = ' '.join(cmd)
arg = ' rabit_task_id=%d' % (taskid)
cmd = cmd + arg
ntrial = 0 ntrial = 0
while True: while True:
prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
arg = ' rabit_task_id=%d rabit_num_trial=%d' % (taskid, ntrial) if args.verbose != 0:
ret = subprocess.call(prep + cmd + arg, shell = True) bash = keepalive % (echo % cmd, prep, cmd)
if ret == 254 or ret == -2: else:
ntrial += 1 bash = keepalive % ('', prep, cmd)
continue ret = subprocess.call(bash, shell=True, executable='bash')
if ret == 0: if ret == 0:
if args.verbose != 0:
print 'Thread %d exit with 0' % taskid
return return
else:
raise Exception('Get nonzero return code=%d' % ret) raise Exception('Get nonzero return code=%d' % ret)
# #
# Note: this submit script is only used for demo purpose # Note: this submit script is only used for demo purpose