[rabit harden] fix rabit tests (#81)
* enable model recovery tests * force use gcc4.8 in Travis
This commit is contained in:
@@ -1,8 +1,25 @@
|
||||
export CC = gcc
|
||||
export CXX = g++
|
||||
export MPICXX = mpicxx
|
||||
export LDFLAGS= -L../lib -pthread -lm -lrt
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x
|
||||
export LDFLAGS= -L../lib -pthread -lm
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11
|
||||
|
||||
OS := $(shell uname)
|
||||
|
||||
ifeq ($(OS), Darwin)
|
||||
ifndef CC
|
||||
export CC = $(if $(shell which clang), clang, gcc)
|
||||
endif
|
||||
ifndef CXX
|
||||
export CXX = $(if $(shell which clang++), clang++, g++)
|
||||
endif
|
||||
else
|
||||
ifndef CC
|
||||
export CC = gcc
|
||||
endif
|
||||
ifndef CXX
|
||||
export CXX = g++
|
||||
endif
|
||||
LDFLAGS += -lrt
|
||||
endif
|
||||
|
||||
# specify tensor path
|
||||
BIN = speed_test model_recover local_recover lazy_recover
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
|
||||
using namespace rabit;
|
||||
|
||||
// dummy model
|
||||
@@ -77,10 +78,9 @@ inline void TestBcast(size_t n, int root, int ntrial, int iter) {
|
||||
std::string res;
|
||||
if (root == rank) {
|
||||
res = s;
|
||||
rabit::Broadcast(&res, root);
|
||||
} else {
|
||||
rabit::Broadcast(&res, root);
|
||||
}
|
||||
rabit::Broadcast(&res, root);
|
||||
|
||||
utils::Check(res == s, "[%d] TestBcast fail", rank);
|
||||
}
|
||||
|
||||
@@ -104,10 +104,9 @@ int main(int argc, char *argv[]) {
|
||||
int iter = rabit::LoadCheckPoint(&model);
|
||||
if (iter == 0) {
|
||||
model.InitModel(n);
|
||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||
} else {
|
||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||
}
|
||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||
|
||||
for (int r = iter; r < 3; ++r) {
|
||||
TestMax(&model, ntrial, r);
|
||||
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
|
||||
@@ -119,7 +118,7 @@ int main(int argc, char *argv[]) {
|
||||
TestSum(&model, ntrial, r);
|
||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||
rabit::CheckPoint(&model);
|
||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
||||
printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
|
||||
}
|
||||
rabit::Finalize();
|
||||
return 0;
|
||||
|
||||
15
test/test.mk
15
test/test.mk
@@ -4,27 +4,26 @@
|
||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
|
||||
|
||||
# this experiment test recovery with actually process exit, use keepalive to keep program alive
|
||||
# TODO: enable those tests once we fix issue in rabit
|
||||
model_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
|
||||
model_recover_10_10k_die_same:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
model_recover_10_10k_die_hard:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
local_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
|
||||
pylocal_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 ./local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
|
||||
lazy_recover_10_10k_die_hard:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
lazy_recover_10_10k_die_same:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
ringallreduce_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10
|
||||
|
||||
Reference in New Issue
Block a user