add keepalive script
This commit is contained in:
parent
7a983a4079
commit
90b9f1a98a
@ -5,8 +5,10 @@ then
|
||||
echo "Repeatively run program until success"
|
||||
exit -1
|
||||
fi
|
||||
nrep=0
|
||||
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK
|
||||
until ./$@ job_id=$OMPI_COMM_WORLD_RANK; do
|
||||
echo "Server "$1" crashed with exit code $?. Respawning.." >&2
|
||||
until ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep; do
|
||||
sleep 1
|
||||
nrep=$((nrep+1))
|
||||
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep
|
||||
done
|
||||
|
||||
@ -109,11 +109,16 @@ int main(int argc, char *argv[]) {
|
||||
Model model;
|
||||
srand(0);
|
||||
int ntrial = 0;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
int n;
|
||||
if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n;
|
||||
}
|
||||
while (true) {
|
||||
try {
|
||||
int iter = rabit::LoadCheckPoint(&model);
|
||||
if (iter == 0) {
|
||||
model.InitModel(n);
|
||||
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||
} else {
|
||||
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user