ok
This commit is contained in:
parent
a7faac2f09
commit
6dbaddd2b9
@ -1,8 +1,7 @@
|
|||||||
# this is a makefile used to show testcases of rabit
|
# this is a makefile used to show testcases of rabit
|
||||||
.PHONY:
|
.PHONY:all
|
||||||
|
|
||||||
test:
|
all:
|
||||||
../tracker/rabit_demo.py -v 1 -n 10 test_model_recover 1 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=8,1,2,0
|
|
||||||
|
|
||||||
# this experiment tests recovery with actual process exit, using keepalive to keep the program alive
|
# this experiment tests recovery with actual process exit, using keepalive to keep the program alive
|
||||||
model_recover_10_10k:
|
model_recover_10_10k:
|
||||||
@ -14,3 +13,6 @@ model_recover_10_10k_die_same:
|
|||||||
model_recover_10_10k_die_hard:
|
model_recover_10_10k_die_hard:
|
||||||
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||||
|
|
||||||
|
|
||||||
|
local_recover_10_10k:
|
||||||
|
../tracker/rabit_demo.py -n 10 test_local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||||
|
|||||||
@ -20,21 +20,41 @@ parser.add_argument('command', nargs='+',
|
|||||||
help = 'command for rabit program')
|
help = 'command for rabit program')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# bash script for keepalive
|
||||||
|
# use it so that python does not need to communicate with the subprocess
|
||||||
|
echo="echo %s rabit_num_trial=$nrep;"
|
||||||
|
keepalive = """
|
||||||
|
nrep=0
|
||||||
|
rc=254
|
||||||
|
while [ $rc -eq 254 ];
|
||||||
|
do
|
||||||
|
%s
|
||||||
|
%s %s rabit_num_trial=$nrep
|
||||||
|
rc=$?;
|
||||||
|
nrep=$((nrep+1));
|
||||||
|
done
|
||||||
|
"""
|
||||||
|
|
||||||
def exec_cmd(cmd, taskid):
|
def exec_cmd(cmd, taskid):
|
||||||
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]):
|
if cmd[0].find('/') == -1 and os.path.exists(cmd[0]):
|
||||||
cmd[0] = './' + cmd[0]
|
cmd[0] = './' + cmd[0]
|
||||||
cmd = ' '.join(cmd)
|
cmd = ' '.join(cmd)
|
||||||
|
arg = ' rabit_task_id=%d' % (taskid)
|
||||||
|
cmd = cmd + arg
|
||||||
ntrial = 0
|
ntrial = 0
|
||||||
while True:
|
while True:
|
||||||
prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
|
prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
|
||||||
arg = ' rabit_task_id=%d rabit_num_trial=%d' % (taskid, ntrial)
|
if args.verbose != 0:
|
||||||
ret = subprocess.call(prep + cmd + arg, shell = True)
|
bash = keepalive % (echo % cmd, prep, cmd)
|
||||||
if ret == 254 or ret == -2:
|
else:
|
||||||
ntrial += 1
|
bash = keepalive % ('', prep, cmd)
|
||||||
continue
|
ret = subprocess.call(bash, shell=True, executable='bash')
|
||||||
if ret == 0:
|
if ret == 0:
|
||||||
|
if args.verbose != 0:
|
||||||
|
print 'Thread %d exit with 0' % taskid
|
||||||
return
|
return
|
||||||
raise Exception('Get nonzero return code=%d' % ret)
|
else:
|
||||||
|
raise Exception('Get nonzero return code=%d' % ret)
|
||||||
#
|
#
|
||||||
# Note: this submit script is only used for demo purpose
|
# Note: this submit script is only used for demo purpose
|
||||||
# submission script using python multi-threading
|
# submission script using python multi-threading
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user