add keepalive
This commit is contained in:
parent
2523288509
commit
7a983a4079
@ -58,7 +58,7 @@ class SlaveEntry:
|
|||||||
if self.rank >= 0:
|
if self.rank >= 0:
|
||||||
return self.rank
|
return self.rank
|
||||||
if self.jobid != 'NULL' and self.jobid in job_map:
|
if self.jobid != 'NULL' and self.jobid in job_map:
|
||||||
job_map[self.jobid]
|
return job_map[self.jobid]
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
def get_neighbor(self, rank, nslave):
|
def get_neighbor(self, rank, nslave):
|
||||||
|
|||||||
12
test/keepalive.sh
Executable file
12
test/keepalive.sh
Executable file
@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
# Keep-alive wrapper: runs ./<program> with the given parameters plus a
# job_id=<rank> argument, and re-runs it every second until it exits with
# status 0.
#
# Usage: keepalive.sh program [parameters...]
#
# The job_id value is read from the OMPI_COMM_WORLD_RANK environment variable.

if [ "$#" -lt 1 ]; then
    # Usage errors are diagnostics, so they go to stderr.
    echo "Usage: program parameters" >&2
    echo "Repeatedly run program until success" >&2
    # "exit -1" is outside the 0-255 range; 255 is the status bash reported
    # for it, now stated explicitly.
    exit 255
fi

job_arg="job_id=$OMPI_COMM_WORLD_RANK"

echo "./$* $job_arg"

# "./$@" expands to "./$1" "$2" ... so arguments that contain whitespace or
# glob characters reach the program unmodified.
until "./$@" "$job_arg"; do
    # Capture the status first: any later command would overwrite $?.
    status=$?
    echo "Server $1 crashed with exit code $status. Respawning.." >&2
    sleep 1
done
|
||||||
@ -41,7 +41,7 @@ inline void TestMax(test::Mock &mock, Model *model, int ntrial, int iter) {
|
|||||||
}
|
}
|
||||||
mock.Allreduce<op::Max>(&ndata[0], ndata.size());
|
mock.Allreduce<op::Max>(&ndata[0], ndata.size());
|
||||||
if (ntrial == iter && rank == 3) {
|
if (ntrial == iter && rank == 3) {
|
||||||
throw MockException();
|
exit(-1);
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
for (size_t i = 0; i < ndata.size(); ++i) {
|
||||||
float rmax = (i * 1) % z + model->data[i];
|
float rmax = (i * 1) % z + model->data[i];
|
||||||
@ -65,7 +65,7 @@ inline void TestSum(test::Mock &mock, Model *model, int ntrial, int iter) {
|
|||||||
mock.Allreduce<op::Sum>(&ndata[0], ndata.size());
|
mock.Allreduce<op::Sum>(&ndata[0], ndata.size());
|
||||||
|
|
||||||
if (ntrial == iter && rank == 0) {
|
if (ntrial == iter && rank == 0) {
|
||||||
throw MockException();
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
for (size_t i = 0; i < ndata.size(); ++i) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user