From 6dbaddd2b902c2b88ec097b3a93563dff4c9f51e Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 14 Jan 2015 22:11:00 -0800
Subject: [PATCH] ok

---
 test/test.mk          |  8 +++++---
 tracker/rabit_demo.py | 32 ++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/test/test.mk b/test/test.mk
index 4e0d2b763..99d146d08 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,8 +1,7 @@
 # this is a makefile used to show testcases of rabit
-.PHONY:
+.PHONY:all
 
-test:
-	../tracker/rabit_demo.py -v 1 -n 10 test_model_recover 1 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=8,1,2,0
+all:
 
 # this experiment test recovery with actually process exit, use keepalive to keep program alive
 model_recover_10_10k:
@@ -14,3 +13,6 @@ model_recover_10_10k_die_same:
 
 model_recover_10_10k_die_hard:
 	../tracker/rabit_demo.py -n 10 test_model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
+
+local_recover_10_10k:
+	../tracker/rabit_demo.py -n 10 test_local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
diff --git a/tracker/rabit_demo.py b/tracker/rabit_demo.py
index e4592a029..b99d03c6b 100755
--- a/tracker/rabit_demo.py
+++ b/tracker/rabit_demo.py
@@ -20,21 +20,41 @@ parser.add_argument('command', nargs='+',
                     help = 'command for rabit program')
 args = parser.parse_args()
 
+# bash script for keepalive
+# use it so that python do not need to communicate with subprocess
+echo="echo %s rabit_num_trial=$nrep;"
+keepalive = """
+nrep=0
+rc=254
+while [ $rc -eq 254 ];
+do
+    %s
+    %s %s rabit_num_trial=$nrep
+    rc=$?;
+    nrep=$((nrep+1));
+done
+"""
+
 def exec_cmd(cmd, taskid):
     if cmd[0].find('/') == -1 and os.path.exists(cmd[0]):
         cmd[0] = './' + cmd[0]
     cmd = ' '.join(cmd)
+    arg = ' rabit_task_id=%d' % (taskid)
+    cmd = cmd + arg
     ntrial = 0
     while True:
         prep = 'PYTHONPATH=\"%s\" ' % WRAPPER_PATH
-        arg = ' rabit_task_id=%d rabit_num_trial=%d' % (taskid, ntrial)
-        ret = subprocess.call(prep + cmd + arg, shell = True)
-        if ret == 254 or ret == -2:
-            ntrial += 1
-            continue
+        if args.verbose != 0:
+            bash = keepalive % (echo % cmd, prep, cmd)
+        else:
+            bash = keepalive % ('', prep, cmd)
+        ret = subprocess.call(bash, shell=True, executable='bash')
         if ret == 0:
+            if args.verbose != 0:
+                print 'Thread %d exit with 0' % taskid
             return
-        raise Exception('Get nonzero return code=%d' % ret)
+        else:
+            raise Exception('Get nonzero return code=%d' % ret)
 #
 # Note: this submit script is only used for demo purpose
 # submission script using pyhton multi-threading