[rabit harden] Enable all tests (#90)
* include osx in tests * address `time_wait` on port assignment * increase submit attempts * clean up tests
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
export MPICXX = mpicxx
|
||||
MPICXX=../mpich/bin/mpicxx
|
||||
export LDFLAGS= -L../lib -pthread -lm
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11
|
||||
|
||||
@@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
|
||||
MPIBIN = speed_test.mpi
|
||||
.PHONY: clean all lib mpi
|
||||
|
||||
.PHONY: lib all
|
||||
|
||||
all: $(BIN)
|
||||
|
||||
lib:
|
||||
cd ..;make;cd -
|
||||
cd ..;make clean;make;cd -
|
||||
|
||||
.PHONY: mpi
|
||||
mpi:
|
||||
cd ..;make mpi;cd -
|
||||
|
||||
# programs
|
||||
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
|
||||
model_recover.o: model_recover.cc ../include/rabit/*.h lib
|
||||
@@ -52,7 +58,8 @@ $(OBJ) :
|
||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||
|
||||
$(MPIBIN) :
|
||||
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS)
|
||||
$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
|
||||
../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
|
||||
|
||||
clean:
|
||||
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
|
||||
|
||||
@@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
|
||||
TestSum(&model, ntrial, r);
|
||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||
rabit::LazyCheckPoint(&model);
|
||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
||||
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
||||
}
|
||||
rabit::Finalize();
|
||||
return 0;
|
||||
|
||||
@@ -130,7 +130,7 @@ int main(int argc, char *argv[]) {
|
||||
TestSum(&model, &local, ntrial, r);
|
||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||
rabit::CheckPoint(&model, &local);
|
||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
||||
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
||||
}
|
||||
rabit::Finalize();
|
||||
return 0;
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import print_function
|
||||
from builtins import range
|
||||
|
||||
import sys
|
||||
sys.path.append('../python')
|
||||
|
||||
import rabit
|
||||
import numpy as np
|
||||
|
||||
|
||||
18
test/test.mk
18
test/test.mk
@@ -1,29 +1,29 @@
|
||||
# this is a makefile used to show testcases of rabit
|
||||
.PHONY: all
|
||||
|
||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
|
||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
|
||||
|
||||
# this experiment tests recovery with an actual process exit; use keepalive to keep the program alive
|
||||
model_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
|
||||
model_recover_10_10k_die_same:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
model_recover_10_10k_die_hard:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
local_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
|
||||
pylocal_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
|
||||
lazy_recover_10_10k_die_hard:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
lazy_recover_10_10k_die_same:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
ringallreduce_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10
|
||||
|
||||
Reference in New Issue
Block a user