add keepalive

This commit is contained in:
tqchen
2014-12-03 13:21:30 -08:00
parent 2523288509
commit 7a983a4079
3 changed files with 15 additions and 3 deletions

12
test/keepalive.sh Executable file
View File

@@ -0,0 +1,12 @@
#!/bin/bash
# Keep-alive wrapper: reruns a program from the current directory until it
# exits with status 0.
#
# Usage: keepalive.sh program [parameters...]
#
# Each attempt is invoked as:
#   ./program parameters... job_id=$OMPI_COMM_WORLD_RANK
# OMPI_COMM_WORLD_RANK is read from the environment (presumably exported by
# the Open MPI launcher -- confirm against how the tests are started). If it
# is unset, the program simply receives "job_id=".
if [ "$#" -lt 1 ]; then
    echo "Usage: program parameters"
    echo "Repeatively run program until success"
    # 255 is the status bash reports for `exit -1`; spelled out explicitly.
    exit 255
fi

# "./$@" expands to "./$1" "$2" ..., so arguments containing whitespace or
# glob characters reach the program as single, unmodified words.
echo "./$@" "job_id=$OMPI_COMM_WORLD_RANK"
until "./$@" "job_id=$OMPI_COMM_WORLD_RANK"; do
    # $? here is still the exit status of the failed run in the `until` test.
    echo "Server $1 crashed with exit code $?. Respawning.." >&2
    # Short pause so a program that fails immediately does not spin the CPU.
    sleep 1
done

View File

@@ -41,7 +41,7 @@ inline void TestMax(test::Mock &mock, Model *model, int ntrial, int iter) {
}
mock.Allreduce<op::Max>(&ndata[0], ndata.size());
if (ntrial == iter && rank == 3) {
throw MockException();
exit(-1);
}
for (size_t i = 0; i < ndata.size(); ++i) {
float rmax = (i * 1) % z + model->data[i];
@@ -65,7 +65,7 @@ inline void TestSum(test::Mock &mock, Model *model, int ntrial, int iter) {
mock.Allreduce<op::Sum>(&ndata[0], ndata.size());
if (ntrial == iter && rank == 0) {
throw MockException();
exit(-1);
}
for (size_t i = 0; i < ndata.size(); ++i) {