From 7a983a4079632d1200c0de45c6237f3f9a463a3f Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 3 Dec 2014 13:21:30 -0800
Subject: [PATCH] add keepalive

src/rabit_master.py: the job_map lookup for a known jobid was evaluated
but its result discarded, so the method fell through to `return -1`.
Return the looked-up value instead.

test/keepalive.sh: new wrapper that re-runs the given program, with
job_id=$OMPI_COMM_WORLD_RANK appended to its arguments, until it exits
successfully, sleeping one second between attempts. Arguments are
expanded quoted so values containing whitespace reach the program intact.

test/test_model_recover.cpp: when ntrial == iter, rank 3 in TestMax and
rank 0 in TestSum now call exit(-1) instead of throwing MockException.
---
 src/rabit_master.py         |  2 +-
 test/keepalive.sh           | 12 ++++++++++++
 test/test_model_recover.cpp |  4 ++--
 3 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100755 test/keepalive.sh

diff --git a/src/rabit_master.py b/src/rabit_master.py
index cfa1cce9a..dbe303e39 100644
--- a/src/rabit_master.py
+++ b/src/rabit_master.py
@@ -58,7 +58,7 @@ class SlaveEntry:
         if self.rank >= 0:
             return self.rank
         if self.jobid != 'NULL' and self.jobid in job_map:
-            job_map[self.jobid]
+            return job_map[self.jobid]
         return -1
 
     def get_neighbor(self, rank, nslave):
diff --git a/test/keepalive.sh b/test/keepalive.sh
new file mode 100755
index 000000000..99bbb83f5
--- /dev/null
+++ b/test/keepalive.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+if [ "$#" -lt 1 ];
+then
+    echo "Usage: program parameters"
+    echo "Repeatively run program until success"
+    exit -1
+fi
+echo "./$@" "job_id=$OMPI_COMM_WORLD_RANK"
+until "./$@" "job_id=$OMPI_COMM_WORLD_RANK"; do
+    echo "Server $1 crashed with exit code $?. Respawning.." >&2
+    sleep 1
+done
diff --git a/test/test_model_recover.cpp b/test/test_model_recover.cpp
index c482c266c..0d9f1bce7 100644
--- a/test/test_model_recover.cpp
+++ b/test/test_model_recover.cpp
@@ -41,7 +41,7 @@ inline void TestMax(test::Mock &mock, Model *model, int ntrial, int iter) {
   }
   mock.Allreduce(&ndata[0], ndata.size());
   if (ntrial == iter && rank == 3) {
-    throw MockException();
+    exit(-1);
   }
   for (size_t i = 0; i < ndata.size(); ++i) {
     float rmax = (i * 1) % z + model->data[i];
@@ -65,7 +65,7 @@ inline void TestSum(test::Mock &mock, Model *model, int ntrial, int iter) {
 
   mock.Allreduce(&ndata[0], ndata.size());
   if (ntrial == iter && rank == 0) {
-    throw MockException();
+    exit(-1);
   }
 
   for (size_t i = 0; i < ndata.size(); ++i) {