[rabit harden] fix rabit tests (#81)
* enable model recovery tests
* force use gcc4.8 in Travis
This commit is contained in:
parent
1cc34f01db
commit
ed06e0c6af
@ -7,10 +7,11 @@ env:
|
|||||||
- TASK=lint LINT_LANG=cpp
|
- TASK=lint LINT_LANG=cpp
|
||||||
- TASK=lint LINT_LANG=python
|
- TASK=lint LINT_LANG=python
|
||||||
- TASK=doc
|
- TASK=doc
|
||||||
- TASK=build CXX=g++
|
- TASK=build
|
||||||
- TASK=test CXX=g++
|
- TASK=test
|
||||||
|
|
||||||
# dependent apt packages
|
# dependent apt packages
|
||||||
|
dist: xenial
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
packages:
|
packages:
|
||||||
@ -21,6 +22,8 @@ addons:
|
|||||||
- libcurl4-openssl-dev
|
- libcurl4-openssl-dev
|
||||||
- unzip
|
- unzip
|
||||||
- python-numpy
|
- python-numpy
|
||||||
|
- gcc-4.8
|
||||||
|
- g++-4.8
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- git clone https://github.com/dmlc/dmlc-core
|
- git clone https://github.com/dmlc/dmlc-core
|
||||||
|
|||||||
29
Makefile
29
Makefile
@ -1,24 +1,37 @@
|
|||||||
OS := $(shell uname)
|
OS := $(shell uname)
|
||||||
|
|
||||||
ifeq ($(OS), Linux)
|
|
||||||
ifndef CXX
|
|
||||||
export CXX = g++
|
|
||||||
endif
|
|
||||||
export MPICXX = mpicxx
|
export MPICXX = mpicxx
|
||||||
export LDFLAGS= -Llib -lrt
|
export LDFLAGS= -Llib
|
||||||
|
|
||||||
|
OS := $(shell uname)
|
||||||
|
|
||||||
|
ifeq ($(OS), Darwin)
|
||||||
|
ifndef CC
|
||||||
|
export CC = $(if $(shell which clang), clang, gcc)
|
||||||
endif
|
endif
|
||||||
|
ifndef CXX
|
||||||
|
export CXX = $(if $(shell which clang++), clang++, g++)
|
||||||
|
endif
|
||||||
|
else
|
||||||
ifeq ($(OS), FreeBSD)
|
ifeq ($(OS), FreeBSD)
|
||||||
ifndef CXX
|
ifndef CXX
|
||||||
export CXX = g++6
|
export CXX = g++6
|
||||||
endif
|
endif
|
||||||
export MPICXX = /usr/local/mpi/bin/mpicxx
|
export MPICXX = /usr/local/mpi/bin/mpicxx
|
||||||
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
|
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
|
||||||
|
else
|
||||||
|
# linux defaults
|
||||||
|
ifndef CC
|
||||||
|
export CC = gcc
|
||||||
|
endif
|
||||||
|
ifndef CXX
|
||||||
|
export CXX = g++
|
||||||
|
endif
|
||||||
|
LDFLAGS += -lrt
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x
|
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
|
||||||
export CFLAGS = -O3 $(WARNFLAGS)
|
export CFLAGS = -O3 $(WARNFLAGS)
|
||||||
|
|
||||||
#----------------------------
|
#----------------------------
|
||||||
|
|||||||
@ -12,7 +12,7 @@
|
|||||||
#include <ostream>
|
#include <ostream>
|
||||||
#include <streambuf>
|
#include <streambuf>
|
||||||
|
|
||||||
#include "dmlc/base.h"
|
#include "base.h"
|
||||||
|
|
||||||
// include uint64_t only to make io standalone
|
// include uint64_t only to make io standalone
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
|
|||||||
@ -9,7 +9,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "./internal/utils.h"
|
#include "./internal/utils.h"
|
||||||
#include "dmlc/io.h"
|
#include "../dmlc/io.h"
|
||||||
|
|
||||||
namespace rabit {
|
namespace rabit {
|
||||||
/*!
|
/*!
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#make -f test.mk model_recover_10_10k || exit -1
|
|
||||||
#make -f test.mk model_recover_10_10k_die_same || exit -1
|
make -f test.mk model_recover_10_10k || exit -1
|
||||||
#make -f test.mk local_recover_10_10k || exit -1
|
make -f test.mk model_recover_10_10k_die_same || exit -1
|
||||||
#make -f test.mk lazy_recover_10_10k_die_hard || exit -1
|
make -f test.mk model_recover_10_10k_die_hard || exit -1
|
||||||
#make -f test.mk lazy_recover_10_10k_die_same || exit -1
|
make -f test.mk local_recover_10_10k || exit -1
|
||||||
|
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
|
||||||
|
make -f test.mk lazy_recover_10_10k_die_same || exit -1
|
||||||
make -f test.mk ringallreduce_10_10k || exit -1
|
make -f test.mk ringallreduce_10_10k || exit -1
|
||||||
|
|||||||
@ -268,8 +268,9 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
|
|||||||
if (num_local_replica != 0) {
|
if (num_local_replica != 0) {
|
||||||
while (true) {
|
while (true) {
|
||||||
if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
|
if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
|
||||||
// save model model to new version place
|
// save model to new version place
|
||||||
int new_version = !local_chkpt_version;
|
int new_version = !local_chkpt_version;
|
||||||
|
|
||||||
local_chkpt[new_version].clear();
|
local_chkpt[new_version].clear();
|
||||||
utils::MemoryBufferStream fs(&local_chkpt[new_version]);
|
utils::MemoryBufferStream fs(&local_chkpt[new_version]);
|
||||||
if (local_model != NULL) {
|
if (local_model != NULL) {
|
||||||
@ -296,6 +297,7 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
|
|||||||
if (lazy_checkpt) {
|
if (lazy_checkpt) {
|
||||||
global_lazycheck = global_model;
|
global_lazycheck = global_model;
|
||||||
} else {
|
} else {
|
||||||
|
printf("[%d] save global checkpoint #%d \n", this->rank, version_number);
|
||||||
global_checkpoint.resize(0);
|
global_checkpoint.resize(0);
|
||||||
utils::MemoryBufferStream fs(&global_checkpoint);
|
utils::MemoryBufferStream fs(&global_checkpoint);
|
||||||
fs.Write(&version_number, sizeof(version_number));
|
fs.Write(&version_number, sizeof(version_number));
|
||||||
@ -737,6 +739,9 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
|
|||||||
succ = TryRecoverLocalState(&local_rptr[local_chkpt_version],
|
succ = TryRecoverLocalState(&local_rptr[local_chkpt_version],
|
||||||
&local_chkpt[local_chkpt_version]);
|
&local_chkpt[local_chkpt_version]);
|
||||||
if (succ != kSuccess) return succ;
|
if (succ != kSuccess) return succ;
|
||||||
|
|
||||||
|
printf("[%d] recovered from local checkpoint version %d \n", this->rank, local_chkpt_version);
|
||||||
|
|
||||||
int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
|
int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
|
||||||
// check if everyone is OK
|
// check if everyone is OK
|
||||||
unsigned state = 0;
|
unsigned state = 0;
|
||||||
|
|||||||
@ -1,8 +1,25 @@
|
|||||||
export CC = gcc
|
|
||||||
export CXX = g++
|
|
||||||
export MPICXX = mpicxx
|
export MPICXX = mpicxx
|
||||||
export LDFLAGS= -L../lib -pthread -lm -lrt
|
export LDFLAGS= -L../lib -pthread -lm
|
||||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x
|
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11
|
||||||
|
|
||||||
|
OS := $(shell uname)
|
||||||
|
|
||||||
|
ifeq ($(OS), Darwin)
|
||||||
|
ifndef CC
|
||||||
|
export CC = $(if $(shell which clang), clang, gcc)
|
||||||
|
endif
|
||||||
|
ifndef CXX
|
||||||
|
export CXX = $(if $(shell which clang++), clang++, g++)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
ifndef CC
|
||||||
|
export CC = gcc
|
||||||
|
endif
|
||||||
|
ifndef CXX
|
||||||
|
export CXX = g++
|
||||||
|
endif
|
||||||
|
LDFLAGS += -lrt
|
||||||
|
endif
|
||||||
|
|
||||||
# specify tensor path
|
# specify tensor path
|
||||||
BIN = speed_test model_recover local_recover lazy_recover
|
BIN = speed_test model_recover local_recover lazy_recover
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
using namespace rabit;
|
using namespace rabit;
|
||||||
|
|
||||||
// dummy model
|
// dummy model
|
||||||
@ -77,10 +78,9 @@ inline void TestBcast(size_t n, int root, int ntrial, int iter) {
|
|||||||
std::string res;
|
std::string res;
|
||||||
if (root == rank) {
|
if (root == rank) {
|
||||||
res = s;
|
res = s;
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
} else {
|
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
}
|
}
|
||||||
|
rabit::Broadcast(&res, root);
|
||||||
|
|
||||||
utils::Check(res == s, "[%d] TestBcast fail", rank);
|
utils::Check(res == s, "[%d] TestBcast fail", rank);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,10 +104,9 @@ int main(int argc, char *argv[]) {
|
|||||||
int iter = rabit::LoadCheckPoint(&model);
|
int iter = rabit::LoadCheckPoint(&model);
|
||||||
if (iter == 0) {
|
if (iter == 0) {
|
||||||
model.InitModel(n);
|
model.InitModel(n);
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
} else {
|
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
}
|
}
|
||||||
|
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
||||||
|
|
||||||
for (int r = iter; r < 3; ++r) {
|
for (int r = iter; r < 3; ++r) {
|
||||||
TestMax(&model, ntrial, r);
|
TestMax(&model, ntrial, r);
|
||||||
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
|
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
|
||||||
@ -119,7 +118,7 @@ int main(int argc, char *argv[]) {
|
|||||||
TestSum(&model, ntrial, r);
|
TestSum(&model, ntrial, r);
|
||||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||||
rabit::CheckPoint(&model);
|
rabit::CheckPoint(&model);
|
||||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
|
||||||
}
|
}
|
||||||
rabit::Finalize();
|
rabit::Finalize();
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
15
test/test.mk
15
test/test.mk
@ -4,27 +4,26 @@
|
|||||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
|
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
|
||||||
|
|
||||||
# this experiment test recovery with actually process exit, use keepalive to keep program alive
|
# this experiment test recovery with actually process exit, use keepalive to keep program alive
|
||||||
# TODO: enable those tests once we fix issue in rabit
|
|
||||||
model_recover_10_10k:
|
model_recover_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||||
|
|
||||||
model_recover_10_10k_die_same:
|
model_recover_10_10k_die_same:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||||
|
|
||||||
model_recover_10_10k_die_hard:
|
model_recover_10_10k_die_hard:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||||
|
|
||||||
local_recover_10_10k:
|
local_recover_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||||
|
|
||||||
pylocal_recover_10_10k:
|
pylocal_recover_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 ./local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||||
|
|
||||||
lazy_recover_10_10k_die_hard:
|
lazy_recover_10_10k_die_hard:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||||
|
|
||||||
lazy_recover_10_10k_die_same:
|
lazy_recover_10_10k_die_same:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||||
|
|
||||||
ringallreduce_10_10k:
|
ringallreduce_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user