[rabit harden] Enable all tests (#90)

* include osx in tests
* address `time_wait` on port assignment
* increase submit attempts.
* cleanup tests
This commit is contained in:
Chen Qin 2019-04-24 04:12:11 -07:00 committed by Jiaming Yuan
parent ecd4bf7aae
commit e3d51d3e62
17 changed files with 168 additions and 66 deletions

6
.gitignore vendored
View File

@ -37,8 +37,12 @@ recommonmark
recom recom
_* _*
#mpi lib
mpich/
mpich-3.2/
# Jetbrain # Jetbrain
.idea .idea
cmake-build-debug/ cmake-build-debug/
.vscode/

View File

@ -1,5 +1,12 @@
# disable sudo to use container based build sudo: true
sudo: false
os:
- linux
- osx
osx_image: xcode10.2
dist: xenial
# Use Build Matrix to do lint and build seperately # Use Build Matrix to do lint and build seperately
env: env:
@ -10,15 +17,17 @@ env:
- TASK=build - TASK=build
- TASK=mpi-build - TASK=mpi-build
- TASK=cmake-build - TASK=cmake-build
- TASK=test CXX=g++-4.8 - TASK=test CXX=g++
# dependent apt packages # dependent apt packages
dist: xenial
addons: addons:
apt: apt:
sources:
- llvm-toolchain-trusty-5.0
- ubuntu-toolchain-r-test
- george-edison55-precise-backports
packages: packages:
- doxygen - doxygen
- libopenmpi-dev
- wget - wget
- git - git
- libcurl4-openssl-dev - libcurl4-openssl-dev
@ -26,30 +35,43 @@ addons:
- python-numpy - python-numpy
- gcc-4.8 - gcc-4.8
- g++-4.8 - g++-4.8
- openmpi-bin
- openmpi-common
- openssh-client - openssh-client
- openssh-server - openssh-server
- libopenmpi-dev - python3
- python3-setuptools
homebrew:
packages:
- gcc49
- openssl
- libgit2
- python3
update: true
before_install: before_install:
- export TRAVIS=dmlc-core/scripts/travis/ - export TRAVIS=dmlc-core/scripts/travis/
- source ${TRAVIS}/travis_setup_env.sh - source ${TRAVIS}/travis_setup_env.sh
- ${TRAVIS}/travis_osx_install.sh
install: install:
- pip install --user cpplint pylint kubernetes urllib3 - if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi
- if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi
- pip3 install cpplint pylint urllib3 numpy
- pip3 install websocket-client kubernetes
script: scripts/travis_script.sh script: scripts/travis_script.sh
before_cache:
- ${TRAVIS}/travis_before_cache.sh
cache: cache:
directories: directories:
- ${HOME}/.cache/usr - ${HOME}/.cache/usr
- ${HOME}/.cache/pip
- mpich
before_cache:
- ${TRAVIS}/travis_before_cache.sh
after_success:
- tree build
- bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
notifications: notifications:
# Emails are sent to the committer's git-configured email address by default, # Emails are sent to the committer's git-configured email address by default,
@ -57,4 +79,3 @@ notifications:
on_success: change on_success: change
on_failure: always on_failure: always

View File

@ -1,23 +1,26 @@
OS := $(shell uname) OS := $(shell uname)
export MPICXX = mpicxx export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
export LDFLAGS= -Llib export CFLAGS = -O3 $(WARNFLAGS) -I $(DMLC)/include -I include/
export LDFLAGS =-Llib
OS := $(shell uname) #download mpi
#echo $(shell scripts/mpi.sh)
MPICXX=./mpich/bin/mpicxx
ifeq ($(OS), Darwin) ifeq ($(OS), Darwin)
ifndef CC ifndef CC
export CC = $(if $(shell which clang), clang, gcc) export CC = gcc-4.9
endif endif
ifndef CXX ifndef CXX
export CXX = $(if $(shell which clang++), clang++, g++) export CXX = g++-4.9
endif endif
else else
ifeq ($(OS), FreeBSD) ifeq ($(OS), FreeBSD)
ifndef CXX ifndef CXX
export CXX = g++6 export CXX = g++6
endif endif
export MPICXX = /usr/local/mpi/bin/mpicxx
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6 export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
else else
# linux defaults # linux defaults
@ -27,13 +30,10 @@ else
ifndef CXX ifndef CXX
export CXX = g++ export CXX = g++
endif endif
LDFLAGS += -lrt LDFLAGS +=-lrt
endif endif
endif endif
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
export CFLAGS = -O3 $(WARNFLAGS)
#---------------------------- #----------------------------
# Settings for power and arm arch # Settings for power and arm arch
#---------------------------- #----------------------------
@ -69,8 +69,10 @@ BPATH=.
MPIOBJ= $(BPATH)/engine_mpi.o MPIOBJ= $(BPATH)/engine_mpi.o
OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\ OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
$(BPATH)/c_api.o $(BPATH)/engine_base.o $(BPATH)/c_api.o $(BPATH)/engine_base.o
SLIB= lib/librabit.so lib/librabit_mpi.so lib/librabit_mock.so lib/librabit_base.so SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
MPISLIB= lib/librabit_mpi.so
MPIALIB= lib/librabit_mpi.a
HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h
DMLC=dmlc-core DMLC=dmlc-core
@ -95,10 +97,7 @@ lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o
lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ) lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ)
$(OBJ) : $(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I include/ -I $(DMLC)/include $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I $(DMLC)/include
$(ALIB): $(ALIB):
ar cr $@ $+ ar cr $@ $+
@ -106,6 +105,16 @@ $(ALIB):
$(SLIB) : $(SLIB) :
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) $(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
# Compile the MPI engine object(s) with the MPI compiler wrapper $(MPICXX),
# adding the local ./mpich headers to the include path. Only the first
# C/C++ source found among the prerequisites is compiled.
$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
# Archive every prerequisite into the static MPI library
# ($+ lists all prerequisites in order, duplicates kept).
$(MPIALIB):
ar cr $@ $+
# Link the shared MPI library from its source/object/archive prerequisites
# and link it against libmpi found in ./mpich/lib.
# NOTE(review): the rpath is the relative path ./mpich/lib, so it presumably
# resolves against the working directory of the process that loads the
# library -- confirm this is intended.
$(MPISLIB) :
$(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \
$(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi
lint: lint:
$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include $(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include
@ -113,4 +122,4 @@ doc doxygen:
cd include; doxygen ../doc/Doxyfile; cd - cd include; doxygen ../doc/Doxyfile; cd -
clean: clean:
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~

@ -1 +1 @@
Subproject commit 15362f8fcc7345d60de13a676a2cbd3ffdc3f064 Subproject commit 13d5acb8ba7e79550bbf2f730f1a3944ff0fa68b

View File

@ -4,7 +4,7 @@ Reliable Allreduce and Broadcast Library.
Author: Tianqi Chen Author: Tianqi Chen
""" """
# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value, # pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
import cPickle as pickle import pickle
import ctypes import ctypes
import os import os
import sys import sys
@ -99,9 +99,10 @@ def init(args=None, lib='standard', lib_dll=None):
When this is presented argument lib will be ignored. When this is presented argument lib will be ignored.
""" """
if args is None: if args is None:
args = sys.argv args = []
_loadlib(lib, lib_dll) _loadlib(lib, lib_dll)
arr = (ctypes.c_char_p * len(args))() arr = (ctypes.c_char_p * len(args))()
arr[:] = args arr[:] = args
_LIB.RabitInit(len(args), arr) _LIB.RabitInit(len(args), arr)

27
scripts/mpi_build.sh Executable file
View File

@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Build a private copy of MPICH 3.2 and install it into ./mpich, relative to
# the directory this script is invoked from. The build is skipped when
# mpich/lib/libmpich.so is already present.
#
# NOTE(review): only the .so suffix is probed; if the library gets a different
# suffix on some platform the build will simply run every time -- confirm.

# Stop at the first failing step. Without this, a failed download, unpack or
# `cd` would let ./configure and make run in the wrong directory, and the
# script would still exit 0 because `cd -` is its last command.
set -e

if [ -f mpich/lib/libmpich.so ]; then
    echo "libmpich.so found -- nothing to build."
else
    echo "Downloading mpich source."
    wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz
    tar xfz mpich-3.2.tar.gz
    # the glob also removes numbered duplicates wget leaves on re-download
    rm -f mpich-3.2.tar.gz*
    echo "configuring and building mpich."
    cd mpich-3.2
    #CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64
    # install next to the source tree; quoted so a path with spaces survives
    ./configure \
        --prefix="$(pwd)/../mpich" \
        --enable-static=false \
        --enable-alloca=true \
        --disable-long-double \
        --enable-threads=single \
        --enable-fortran=no \
        --enable-fast=all \
        --enable-g=none \
        --enable-timing=none \
        --enable-cxx
    make -j4
    make install
    cd -
fi

View File

@ -7,3 +7,4 @@ make -f test.mk local_recover_10_10k || exit -1
make -f test.mk lazy_recover_10_10k_die_hard || exit -1 make -f test.mk lazy_recover_10_10k_die_hard || exit -1
make -f test.mk lazy_recover_10_10k_die_same || exit -1 make -f test.mk lazy_recover_10_10k_die_same || exit -1
make -f test.mk ringallreduce_10_10k || exit -1 make -f test.mk ringallreduce_10_10k || exit -1
make -f test.mk pylocal_recover_10_10k || exit -1

View File

@ -15,6 +15,7 @@ if [ ${TASK} == "build" ]; then
fi fi
if [ ${TASK} == "mpi-build" ]; then if [ ${TASK} == "mpi-build" ]; then
./scripts/mpi_build.sh
cd test cd test
make mpi && make speed_test.mpi || exit -1 make mpi && make speed_test.mpi || exit -1
fi fi

View File

@ -131,6 +131,8 @@ void AllreduceBase::Shutdown(void) {
utils::TCPSocket tracker = this->ConnectTracker(); utils::TCPSocket tracker = this->ConnectTracker();
tracker.SendStr(std::string("shutdown")); tracker.SendStr(std::string("shutdown"));
tracker.Close(); tracker.Close();
// close listening sockets
sock_listen.Close();
utils::TCPSocket::Finalize(); utils::TCPSocket::Finalize();
} }
void AllreduceBase::TrackerPrint(const std::string &msg) { void AllreduceBase::TrackerPrint(const std::string &msg) {
@ -271,12 +273,26 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
"ReConnectLink failure 4"); "ReConnectLink failure 4");
Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank), Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
"ReConnectLink failure 4"); "ReConnectLink failure 4");
// create listening socket
utils::TCPSocket sock_listen; if (sock_listen == INVALID_SOCKET || sock_listen.AtMark()) {
sock_listen.Create(); if (!sock_listen.IsClosed()) {
int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial); sock_listen.Close();
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); }
sock_listen.Listen(); // create listening socket
sock_listen.Create();
sock_listen.SetKeepAlive(true);
// http://deepix.github.io/2016/10/21/tcprst.html
sock_listen.SetLinger(0);
// candidate ports: [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial)
// work around processes binding to the same port without the reuse option set:
// each worker starts exploring from slave_port + newrank % nport_trial towards the end
port = sock_listen.TryBindHost(slave_port + newrank % nport_trial, slave_port + nport_trial);
// if no port was bindable there, explore the part of the range skipped above;
// the result must be stored in port, otherwise the check below fails even
// though the fallback bind succeeded
if (port == -1) {
  port = sock_listen.TryBindHost(slave_port, slave_port + newrank % nport_trial);
}
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
sock_listen.Listen();
}
// get number of to connect and number of to accept nodes from tracker // get number of to connect and number of to accept nodes from tracker
int num_conn, num_accept, num_error = 1; int num_conn, num_accept, num_error = 1;
@ -311,6 +327,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
"ReConnectLink failure 9"); "ReConnectLink failure 9");
Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
"ReConnectLink failure 10"); "ReConnectLink failure 10");
r.sock.Create(); r.sock.Create();
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) { if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
num_error += 1; r.sock.Close(); continue; num_error += 1; r.sock.Close(); continue;
@ -357,8 +374,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
} }
if (!match) all_links.push_back(r); if (!match) all_links.push_back(r);
} }
// close listening sockets
sock_listen.Close();
this->parent_index = -1; this->parent_index = -1;
// setup tree links and ring structure // setup tree links and ring structure
tree_links.plinks.clear(); tree_links.plinks.clear();

View File

@ -521,6 +521,10 @@ class AllreduceBase : public IEngine {
int world_size; int world_size;
// connect retry time // connect retry time
int connect_retry; int connect_retry;
// backdoor listening peer connection
utils::TCPSocket sock_listen;
// backdoor port
int port = 0;
}; };
} // namespace engine } // namespace engine
} // namespace rabit } // namespace rabit

View File

@ -51,12 +51,10 @@ void AllreduceRobust::Shutdown(void) {
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp), utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
"Shutdown: check ack must return true"); "Shutdown: check ack must return true");
// one worker shutdowns and closes sockets while rest still run kCheckAck, #if defined (__APPLE__)
// seems has something to do with time-wait state in tcp connection, sleep(1);
// this cause rest workers checkandrecover and hang inf, #endif
// https://github.com/dmlc/xgboost/pull/3818
// TODO(Chen Qin): a fundamental fix for this
sleep(1);
AllreduceBase::Shutdown(); AllreduceBase::Shutdown();
} }
/*! /*!

View File

@ -276,13 +276,21 @@ class TCPSocket : public Socket{
* \brief enable/disable TCP keepalive * \brief enable/disable TCP keepalive
* \param keepalive whether to set the keep alive option on * \param keepalive whether to set the keep alive option on
*/ */
inline void SetKeepAlive(bool keepalive) { void SetKeepAlive(bool keepalive) {
int opt = static_cast<int>(keepalive); int opt = static_cast<int>(keepalive);
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) { reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
Socket::Error("SetKeepAlive"); Socket::Error("SetKeepAlive");
} }
} }
/*!
 * \brief enable the SO_LINGER option on this socket
 * \param timeout linger interval in seconds; with the default of 0 a close
 *        aborts the connection instead of lingering (see SO_LINGER docs)
 */
inline void SetLinger(int timeout = 0) {
  struct linger sl;
  sl.l_onoff = 1;          /* non-zero value enables linger option in kernel */
  sl.l_linger = timeout;   /* timeout interval in seconds */
  // pass the option as char*, same as SetKeepAlive: Winsock declares the
  // option value as const char*, so a bare struct linger* does not compile there
  if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER,
                 reinterpret_cast<char*>(&sl), sizeof(sl)) == -1) {
    Socket::Error("SO_LINGER");
  }
}
/*! /*!
* \brief create the socket, call this before using socket * \brief create the socket, call this before using socket
* \param af domain * \param af domain

View File

@ -1,4 +1,4 @@
export MPICXX = mpicxx MPICXX=../mpich/bin/mpicxx
export LDFLAGS= -L../lib -pthread -lm export LDFLAGS= -L../lib -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11 export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11
@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
MPIBIN = speed_test.mpi MPIBIN = speed_test.mpi
.PHONY: clean all lib mpi .PHONY: clean all lib mpi
.PHONY: lib all
all: $(BIN) all: $(BIN)
lib: lib:
cd ..;make;cd - cd ..;make clean;make;cd -
.PHONY: mpi
mpi: mpi:
cd ..;make mpi;cd - cd ..;make mpi;cd -
# programs # programs
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
model_recover.o: model_recover.cc ../include/rabit/*.h lib model_recover.o: model_recover.cc ../include/rabit/*.h lib
@ -52,7 +58,8 @@ $(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIBIN) : $(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS) $(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
clean: clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~ $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~

View File

@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
TestSum(&model, ntrial, r); TestSum(&model, ntrial, r);
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
rabit::LazyCheckPoint(&model); rabit::LazyCheckPoint(&model);
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
} }
rabit::Finalize(); rabit::Finalize();
return 0; return 0;

View File

@ -130,7 +130,7 @@ int main(int argc, char *argv[]) {
TestSum(&model, &local, ntrial, r); TestSum(&model, &local, ntrial, r);
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r); printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
rabit::CheckPoint(&model, &local); rabit::CheckPoint(&model, &local);
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r); printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
} }
rabit::Finalize(); rabit::Finalize();
return 0; return 0;

View File

@ -1,6 +1,11 @@
#!/usr/bin/python #!/usr/bin/env python3
from __future__ import print_function from __future__ import print_function
from builtins import range from builtins import range
import sys
sys.path.append('../python')
import rabit import rabit
import numpy as np import numpy as np

View File

@ -1,29 +1,29 @@
# this is a makefile used to show testcases of rabit # this is a makefile used to show testcases of rabit
.PHONY: all .PHONY: all
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
# this experiment test recovery with actually process exit, use keepalive to keep program alive # this experiment test recovery with actually process exit, use keepalive to keep program alive
model_recover_10_10k: model_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
model_recover_10_10k_die_same: model_recover_10_10k_die_same:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
model_recover_10_10k_die_hard: model_recover_10_10k_die_hard:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
local_recover_10_10k: local_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
pylocal_recover_10_10k: pylocal_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
lazy_recover_10_10k_die_hard: lazy_recover_10_10k_die_hard:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
lazy_recover_10_10k_die_same: lazy_recover_10_10k_die_same:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
ringallreduce_10_10k: ringallreduce_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10 ../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10