From ed06e0c6af9444ddf570291bc383fc701b9286b4 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 14 Mar 2019 16:16:45 -0700 Subject: [PATCH] [rabit harden] fix rabit tests (#81) * enable model recovery tests * force use gcc4.8 in Travis --- .travis.yml | 7 ++++-- Makefile | 43 +++++++++++++++++++++++------------- include/dmlc/io.h | 2 +- include/rabit/serializable.h | 2 +- scripts/travis_runtest.sh | 12 +++++----- src/allreduce_robust.cc | 7 +++++- test/Makefile | 25 +++++++++++++++++---- test/model_recover.cc | 13 +++++------ test/test.mk | 15 ++++++------- 9 files changed, 82 insertions(+), 44 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1e0aa244b..85bf8f090 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,10 +7,11 @@ env: - TASK=lint LINT_LANG=cpp - TASK=lint LINT_LANG=python - TASK=doc - - TASK=build CXX=g++ - - TASK=test CXX=g++ + - TASK=build + - TASK=test # dependent apt packages +dist: xenial addons: apt: packages: @@ -21,6 +22,8 @@ addons: - libcurl4-openssl-dev - unzip - python-numpy + - gcc-4.8 + - g++-4.8 before_install: - git clone https://github.com/dmlc/dmlc-core diff --git a/Makefile b/Makefile index 12226308b..816fdeb2e 100644 --- a/Makefile +++ b/Makefile @@ -1,24 +1,37 @@ OS := $(shell uname) -ifeq ($(OS), Linux) -ifndef CXX -export CXX = g++ -endif export MPICXX = mpicxx -export LDFLAGS= -Llib -lrt +export LDFLAGS= -Llib +OS := $(shell uname) + +ifeq ($(OS), Darwin) + ifndef CC + export CC = $(if $(shell which clang), clang, gcc) + endif + ifndef CXX + export CXX = $(if $(shell which clang++), clang++, g++) + endif +else + ifeq ($(OS), FreeBSD) + ifndef CXX + export CXX = g++6 + endif + export MPICXX = /usr/local/mpi/bin/mpicxx + export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6 + else + # linux defaults + ifndef CC + export CC = gcc + endif + ifndef CXX + export CXX = g++ + endif + LDFLAGS += -lrt + endif endif -ifeq ($(OS), FreeBSD) -ifndef CXX -export CXX = g++6 -endif -export MPICXX = /usr/local/mpi/bin/mpicxx -export 
LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6 - -endif - -export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x +export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11 export CFLAGS = -O3 $(WARNFLAGS) #---------------------------- diff --git a/include/dmlc/io.h b/include/dmlc/io.h index 80d46200d..1965a7ad2 100644 --- a/include/dmlc/io.h +++ b/include/dmlc/io.h @@ -12,7 +12,7 @@ #include #include -#include "dmlc/base.h" +#include "base.h" // include uint64_t only to make io standalone #ifdef _MSC_VER diff --git a/include/rabit/serializable.h b/include/rabit/serializable.h index 225b2fbd1..4a3c2a115 100644 --- a/include/rabit/serializable.h +++ b/include/rabit/serializable.h @@ -9,7 +9,7 @@ #include #include #include "./internal/utils.h" -#include "dmlc/io.h" +#include "../dmlc/io.h" namespace rabit { /*! diff --git a/scripts/travis_runtest.sh b/scripts/travis_runtest.sh index 393e62770..4f14ad170 100755 --- a/scripts/travis_runtest.sh +++ b/scripts/travis_runtest.sh @@ -1,7 +1,9 @@ #!/bin/bash -#make -f test.mk model_recover_10_10k || exit -1 -#make -f test.mk model_recover_10_10k_die_same || exit -1 -#make -f test.mk local_recover_10_10k || exit -1 -#make -f test.mk lazy_recover_10_10k_die_hard || exit -1 -#make -f test.mk lazy_recover_10_10k_die_same || exit -1 + +make -f test.mk model_recover_10_10k || exit -1 +make -f test.mk model_recover_10_10k_die_same || exit -1 +make -f test.mk model_recover_10_10k_die_hard || exit -1 +make -f test.mk local_recover_10_10k || exit -1 +make -f test.mk lazy_recover_10_10k_die_hard || exit -1 +make -f test.mk lazy_recover_10_10k_die_same || exit -1 make -f test.mk ringallreduce_10_10k || exit -1 diff --git a/src/allreduce_robust.cc b/src/allreduce_robust.cc index b81af3e48..a9a89af07 100644 --- a/src/allreduce_robust.cc +++ b/src/allreduce_robust.cc @@ -268,8 +268,9 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model, if (num_local_replica != 0) { 
while (true) { if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break; - // save model model to new version place + // save model to new version place int new_version = !local_chkpt_version; + local_chkpt[new_version].clear(); utils::MemoryBufferStream fs(&local_chkpt[new_version]); if (local_model != NULL) { @@ -296,6 +297,7 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model, if (lazy_checkpt) { global_lazycheck = global_model; } else { + printf("[%d] save global checkpoint #%d \n", this->rank, version_number); global_checkpoint.resize(0); utils::MemoryBufferStream fs(&global_checkpoint); fs.Write(&version_number, sizeof(version_number)); @@ -737,6 +739,9 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) { succ = TryRecoverLocalState(&local_rptr[local_chkpt_version], &local_chkpt[local_chkpt_version]); if (succ != kSuccess) return succ; + + printf("[%d] recovered from local checkpoint version %d \n", this->rank, local_chkpt_version); + int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0); // check if everyone is OK unsigned state = 0; diff --git a/test/Makefile b/test/Makefile index 91a43e1fb..4386c2e5e 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,8 +1,25 @@ -export CC = gcc -export CXX = g++ export MPICXX = mpicxx -export LDFLAGS= -L../lib -pthread -lm -lrt -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x +export LDFLAGS= -L../lib -pthread -lm +export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11 + +OS := $(shell uname) + +ifeq ($(OS), Darwin) + ifndef CC + export CC = $(if $(shell which clang), clang, gcc) + endif + ifndef CXX + export CXX = $(if $(shell which clang++), clang++, g++) + endif +else + ifndef CC + export CC = gcc + endif + ifndef CXX + export CXX = g++ + endif + LDFLAGS += -lrt +endif # specify tensor path BIN = speed_test model_recover local_recover lazy_recover diff --git 
a/test/model_recover.cc b/test/model_recover.cc index a2709f892..e67208f7a 100644 --- a/test/model_recover.cc +++ b/test/model_recover.cc @@ -4,6 +4,7 @@ #include #include #include + using namespace rabit; // dummy model @@ -77,10 +78,9 @@ inline void TestBcast(size_t n, int root, int ntrial, int iter) { std::string res; if (root == rank) { res = s; - rabit::Broadcast(&res, root); - } else { - rabit::Broadcast(&res, root); } + rabit::Broadcast(&res, root); + utils::Check(res == s, "[%d] TestBcast fail", rank); } @@ -104,10 +104,9 @@ int main(int argc, char *argv[]) { int iter = rabit::LoadCheckPoint(&model); if (iter == 0) { model.InitModel(n); - printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); - } else { - printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); } + printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter); + for (int r = iter; r < 3; ++r) { TestMax(&model, ntrial, r); printf("[%d] !!!TestMax pass, iter=%d\n", rank, r); @@ -119,7 +118,7 @@ int main(int argc, char *argv[]) { TestSum(&model, ntrial, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); rabit::CheckPoint(&model); - printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); + printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r); } rabit::Finalize(); return 0; diff --git a/test/test.mk b/test/test.mk index 9dfebb02e..7b9995070 100644 --- a/test/test.mk +++ b/test/test.mk @@ -4,27 +4,26 @@ all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k # this experiment test recovery with actually process exit, use keepalive to keep program alive -# TODO: enable those tests once we fix issue in rabit model_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 model_recover_10_10k_die_same: - 
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 model_recover_10_10k_die_hard: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 local_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 pylocal_recover_10_10k: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 ./local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 lazy_recover_10_10k_die_hard: - ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 lazy_recover_10_10k_die_same: - 
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 + ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ringallreduce_10_10k: ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10