add keepalive

This commit is contained in:
tqchen 2014-12-03 13:21:30 -08:00
parent 2523288509
commit 7a983a4079
3 changed files with 15 additions and 3 deletions

View File

@ -58,7 +58,7 @@ class SlaveEntry:
if self.rank >= 0: if self.rank >= 0:
return self.rank return self.rank
if self.jobid != 'NULL' and self.jobid in job_map: if self.jobid != 'NULL' and self.jobid in job_map:
job_map[self.jobid] return job_map[self.jobid]
return -1 return -1
def get_neighbor(self, rank, nslave): def get_neighbor(self, rank, nslave):

12
test/keepalive.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
# keepalive.sh -- re-run a program until it exits successfully.
#
# Usage: keepalive.sh program [parameters...]
#
# The program is started as ./program (relative to the current directory)
# with every extra parameter passed through unchanged, followed by
# job_id=$OMPI_COMM_WORLD_RANK. Whenever the program exits with a non-zero
# status, a message is written to stderr and it is started again after a
# one second pause.
#
# NOTE(review): OMPI_COMM_WORLD_RANK is presumably exported by the Open MPI
# launcher; if it is unset the program simply receives "job_id=".
if [ "$#" -lt 1 ]; then
    echo "Usage: program parameters"
    echo "Repeatively run program until success"
    # 255 is the status bash reports for `exit -1`, spelled in the portable
    # 0-255 range.
    exit 255
fi

job_arg="job_id=$OMPI_COMM_WORLD_RANK"

# "./$@" glues "./" onto the first argument only and keeps every remaining
# argument as its own word, so parameters containing spaces or glob
# characters reach the program intact.
echo "./$@" "$job_arg"
until "./$@" "$job_arg"; do
    # Capture the status right away, before any other command overwrites $?.
    status=$?
    echo "Server $1 crashed with exit code $status. Respawning.." >&2
    sleep 1
done

View File

@ -41,7 +41,7 @@ inline void TestMax(test::Mock &mock, Model *model, int ntrial, int iter) {
} }
mock.Allreduce<op::Max>(&ndata[0], ndata.size()); mock.Allreduce<op::Max>(&ndata[0], ndata.size());
if (ntrial == iter && rank == 3) { if (ntrial == iter && rank == 3) {
throw MockException(); exit(-1);
} }
for (size_t i = 0; i < ndata.size(); ++i) { for (size_t i = 0; i < ndata.size(); ++i) {
float rmax = (i * 1) % z + model->data[i]; float rmax = (i * 1) % z + model->data[i];
@ -65,7 +65,7 @@ inline void TestSum(test::Mock &mock, Model *model, int ntrial, int iter) {
mock.Allreduce<op::Sum>(&ndata[0], ndata.size()); mock.Allreduce<op::Sum>(&ndata[0], ndata.size());
if (ntrial == iter && rank == 0) { if (ntrial == iter && rank == 0) {
throw MockException(); exit(-1);
} }
for (size_t i = 0; i < ndata.size(); ++i) { for (size_t i = 0; i < ndata.size(); ++i) {