diff --git a/test/keepalive.sh b/test/keepalive.sh index 99bbb83f5..e72a2bba9 100755 --- a/test/keepalive.sh +++ b/test/keepalive.sh @@ -5,8 +5,10 @@ then echo "Repeatively run program until success" exit -1 fi +nrep=0 echo ./$@ job_id=$OMPI_COMM_WORLD_RANK -until ./$@ job_id=$OMPI_COMM_WORLD_RANK; do - echo "Server "$1" crashed with exit code $?. Respawning.." >&2 +until ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep; do sleep 1 + nrep=$((nrep+1)) + echo ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep done diff --git a/test/test_model_recover.cpp b/test/test_model_recover.cpp index 0d9f1bce7..86762c671 100644 --- a/test/test_model_recover.cpp +++ b/test/test_model_recover.cpp @@ -109,11 +109,16 @@ int main(int argc, char *argv[]) { Model model; srand(0); int ntrial = 0; + for (int i = 1; i < argc; ++i) { + int n; + if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n; + } while (true) { try { int iter = rabit::LoadCheckPoint(&model); if (iter == 0) { model.InitModel(n); + utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); } else { utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); }