[rabit harden] fix rabit tests (#81)

* enable model recovery tests
* force the use of gcc 4.8 in Travis
This commit is contained in:
Chen Qin 2019-03-14 16:16:45 -07:00 committed by Jiaming Yuan
parent 1cc34f01db
commit ed06e0c6af
9 changed files with 82 additions and 44 deletions

View File

@ -7,10 +7,11 @@ env:
- TASK=lint LINT_LANG=cpp - TASK=lint LINT_LANG=cpp
- TASK=lint LINT_LANG=python - TASK=lint LINT_LANG=python
- TASK=doc - TASK=doc
- TASK=build CXX=g++ - TASK=build
- TASK=test CXX=g++ - TASK=test
# dependent apt packages # dependent apt packages
dist: xenial
addons: addons:
apt: apt:
packages: packages:
@ -21,6 +22,8 @@ addons:
- libcurl4-openssl-dev - libcurl4-openssl-dev
- unzip - unzip
- python-numpy - python-numpy
- gcc-4.8
- g++-4.8
before_install: before_install:
- git clone https://github.com/dmlc/dmlc-core - git clone https://github.com/dmlc/dmlc-core

View File

@ -1,24 +1,37 @@
OS := $(shell uname) OS := $(shell uname)
ifeq ($(OS), Linux)
ifndef CXX
export CXX = g++
endif
export MPICXX = mpicxx export MPICXX = mpicxx
export LDFLAGS= -Llib -lrt export LDFLAGS= -Llib
OS := $(shell uname)
ifeq ($(OS), Darwin)
ifndef CC
export CC = $(if $(shell which clang), clang, gcc)
endif
ifndef CXX
export CXX = $(if $(shell which clang++), clang++, g++)
endif
else
ifeq ($(OS), FreeBSD)
ifndef CXX
export CXX = g++6
endif
export MPICXX = /usr/local/mpi/bin/mpicxx
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
else
# linux defaults
ifndef CC
export CC = gcc
endif
ifndef CXX
export CXX = g++
endif
LDFLAGS += -lrt
endif
endif endif
ifeq ($(OS), FreeBSD) export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
ifndef CXX
export CXX = g++6
endif
export MPICXX = /usr/local/mpi/bin/mpicxx
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
endif
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x
export CFLAGS = -O3 $(WARNFLAGS) export CFLAGS = -O3 $(WARNFLAGS)
#---------------------------- #----------------------------

View File

@ -12,7 +12,7 @@
#include <ostream> #include <ostream>
#include <streambuf> #include <streambuf>
#include "dmlc/base.h" #include "base.h"
// include uint64_t only to make io standalone // include uint64_t only to make io standalone
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@ -9,7 +9,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include "./internal/utils.h" #include "./internal/utils.h"
#include "dmlc/io.h" #include "../dmlc/io.h"
namespace rabit { namespace rabit {
/*! /*!

View File

@ -1,7 +1,9 @@
#!/bin/bash #!/bin/bash
#make -f test.mk model_recover_10_10k || exit -1
#make -f test.mk model_recover_10_10k_die_same || exit -1 make -f test.mk model_recover_10_10k || exit -1
#make -f test.mk local_recover_10_10k || exit -1 make -f test.mk model_recover_10_10k_die_same || exit -1
#make -f test.mk lazy_recover_10_10k_die_hard || exit -1 make -f test.mk model_recover_10_10k_die_hard || exit -1
#make -f test.mk lazy_recover_10_10k_die_same || exit -1 make -f test.mk local_recover_10_10k || exit -1
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
make -f test.mk lazy_recover_10_10k_die_same || exit -1
make -f test.mk ringallreduce_10_10k || exit -1 make -f test.mk ringallreduce_10_10k || exit -1

View File

@ -268,8 +268,9 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
if (num_local_replica != 0) { if (num_local_replica != 0) {
while (true) { while (true) {
if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break; if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
// save model model to new version place // save model to new version place
int new_version = !local_chkpt_version; int new_version = !local_chkpt_version;
local_chkpt[new_version].clear(); local_chkpt[new_version].clear();
utils::MemoryBufferStream fs(&local_chkpt[new_version]); utils::MemoryBufferStream fs(&local_chkpt[new_version]);
if (local_model != NULL) { if (local_model != NULL) {
@ -296,6 +297,7 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
if (lazy_checkpt) { if (lazy_checkpt) {
global_lazycheck = global_model; global_lazycheck = global_model;
} else { } else {
printf("[%d] save global checkpoint #%d \n", this->rank, version_number);
global_checkpoint.resize(0); global_checkpoint.resize(0);
utils::MemoryBufferStream fs(&global_checkpoint); utils::MemoryBufferStream fs(&global_checkpoint);
fs.Write(&version_number, sizeof(version_number)); fs.Write(&version_number, sizeof(version_number));
@ -737,6 +739,9 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
succ = TryRecoverLocalState(&local_rptr[local_chkpt_version], succ = TryRecoverLocalState(&local_rptr[local_chkpt_version],
&local_chkpt[local_chkpt_version]); &local_chkpt[local_chkpt_version]);
if (succ != kSuccess) return succ; if (succ != kSuccess) return succ;
printf("[%d] recovered from local checkpoint version %d \n", this->rank, local_chkpt_version);
int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0); int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
// check if everyone is OK // check if everyone is OK
unsigned state = 0; unsigned state = 0;

View File

@ -1,8 +1,25 @@
export CC = gcc
export CXX = g++
export MPICXX = mpicxx export MPICXX = mpicxx
export LDFLAGS= -L../lib -pthread -lm -lrt export LDFLAGS= -L../lib -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11
OS := $(shell uname)
ifeq ($(OS), Darwin)
ifndef CC
export CC = $(if $(shell which clang), clang, gcc)
endif
ifndef CXX
export CXX = $(if $(shell which clang++), clang++, g++)
endif
else
ifndef CC
export CC = gcc
endif
ifndef CXX
export CXX = g++
endif
LDFLAGS += -lrt
endif
# specify tensor path # specify tensor path
BIN = speed_test model_recover local_recover lazy_recover BIN = speed_test model_recover local_recover lazy_recover

View File

@ -4,6 +4,7 @@
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cmath> #include <cmath>
using namespace rabit; using namespace rabit;
// dummy model // dummy model
@ -77,10 +78,9 @@ inline void TestBcast(size_t n, int root, int ntrial, int iter) {
std::string res; std::string res;
if (root == rank) { if (root == rank) {
res = s; res = s;
rabit::Broadcast(&res, root);
} else {
rabit::Broadcast(&res, root);
} }
rabit::Broadcast(&res, root);
utils::Check(res == s, "[%d] TestBcast fail", rank); utils::Check(res == s, "[%d] TestBcast fail", rank);
} }
@ -104,10 +104,9 @@ int main(int argc, char *argv[]) {
int iter = rabit::LoadCheckPoint(&model); int iter = rabit::LoadCheckPoint(&model);
if (iter == 0) { if (iter == 0) {
model.InitModel(n); model.InitModel(n);
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
} else {
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
} }
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
for (int r = iter; r < 3; ++r) { for (int r = iter; r < 3; ++r) {
TestMax(&model, ntrial, r); TestMax(&model, ntrial, r);
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r); printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
@ -119,7 +118,7 @@ int main(int argc, char *argv[]) {
TestSum(&model, ntrial, r); TestSum(&model, ntrial, r);
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
rabit::CheckPoint(&model); rabit::CheckPoint(&model);
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
} }
rabit::Finalize(); rabit::Finalize();
return 0; return 0;

View File

@ -4,27 +4,26 @@
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
# this experiment tests recovery with an actual process exit; use keepalive to keep the program alive # this experiment tests recovery with an actual process exit; use keepalive to keep the program alive
# TODO: enable those tests once we fix issue in rabit
model_recover_10_10k: model_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
model_recover_10_10k_die_same: model_recover_10_10k_die_same:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
model_recover_10_10k_die_hard: model_recover_10_10k_die_hard:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
local_recover_10_10k: local_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
pylocal_recover_10_10k: pylocal_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 ./local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
lazy_recover_10_10k_die_hard: lazy_recover_10_10k_die_hard:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
lazy_recover_10_10k_die_same: lazy_recover_10_10k_die_same:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
ringallreduce_10_10k: ringallreduce_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10