add keepalive script

This commit is contained in:
tqchen 2014-12-03 15:04:30 -08:00
parent 7a983a4079
commit 90b9f1a98a
2 changed files with 9 additions and 2 deletions

View File

@@ -5,8 +5,10 @@ then
echo "Repeatively run program until success"
exit -1
fi
nrep=0
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK
until ./$@ job_id=$OMPI_COMM_WORLD_RANK; do
echo "Server "$1" crashed with exit code $?. Respawning.." >&2
until ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep; do
sleep 1
nrep=$((nrep+1))
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep
done

View File

@@ -109,11 +109,16 @@ int main(int argc, char *argv[]) {
Model model;
srand(0);
int ntrial = 0;
for (int i = 1; i < argc; ++i) {
int n;
if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n;
}
while (true) {
try {
int iter = rabit::LoadCheckPoint(&model);
if (iter == 0) {
model.InitModel(n);
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
} else {
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
}