Merge branch 'master' of https://github.com/tqchen/allreduce
This commit is contained in:
commit
e3a95b2d1a
@ -5,8 +5,10 @@ then
|
|||||||
echo "Repeatively run program until success"
|
echo "Repeatively run program until success"
|
||||||
exit -1
|
exit -1
|
||||||
fi
|
fi
|
||||||
|
nrep=0
|
||||||
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK
|
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK
|
||||||
until ./$@ job_id=$OMPI_COMM_WORLD_RANK; do
|
until ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep; do
|
||||||
echo "Server "$1" crashed with exit code $?. Respawning.." >&2
|
|
||||||
sleep 1
|
sleep 1
|
||||||
|
nrep=$((nrep+1))
|
||||||
|
echo ./$@ job_id=$OMPI_COMM_WORLD_RANK repeat=$nrep
|
||||||
done
|
done
|
||||||
|
|||||||
@ -109,11 +109,16 @@ int main(int argc, char *argv[]) {
|
|||||||
Model model;
|
Model model;
|
||||||
srand(0);
|
srand(0);
|
||||||
int ntrial = 0;
|
int ntrial = 0;
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
int n;
|
||||||
|
if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n;
|
||||||
|
}
|
||||||
while (true) {
|
while (true) {
|
||||||
try {
|
try {
|
||||||
int iter = rabit::LoadCheckPoint(&model);
|
int iter = rabit::LoadCheckPoint(&model);
|
||||||
if (iter == 0) {
|
if (iter == 0) {
|
||||||
model.InitModel(n);
|
model.InitModel(n);
|
||||||
|
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||||
} else {
|
} else {
|
||||||
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
utils::LogPrintf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user