[rabit harden] Enable all tests (#90)
* include osx in tests * address `time_wait` on port assignment * increase submit attempts. * cleanup tests
This commit is contained in:
parent
ecd4bf7aae
commit
e3d51d3e62
6
.gitignore
vendored
6
.gitignore
vendored
@ -37,8 +37,12 @@ recommonmark
|
||||
recom
|
||||
_*
|
||||
|
||||
#mpi lib
|
||||
mpich/
|
||||
mpich-3.2/
|
||||
|
||||
# Jetbrain
|
||||
.idea
|
||||
cmake-build-debug/
|
||||
|
||||
.vscode/
|
||||
|
||||
|
||||
51
.travis.yml
51
.travis.yml
@ -1,5 +1,12 @@
|
||||
# disable sudo to use container based build
|
||||
sudo: false
|
||||
sudo: true
|
||||
|
||||
os:
|
||||
- linux
|
||||
- osx
|
||||
|
||||
osx_image: xcode10.2
|
||||
|
||||
dist: xenial
|
||||
|
||||
# Use Build Matrix to do lint and build seperately
|
||||
env:
|
||||
@ -10,15 +17,17 @@ env:
|
||||
- TASK=build
|
||||
- TASK=mpi-build
|
||||
- TASK=cmake-build
|
||||
- TASK=test CXX=g++-4.8
|
||||
- TASK=test CXX=g++
|
||||
|
||||
# dependent apt packages
|
||||
dist: xenial
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- llvm-toolchain-trusty-5.0
|
||||
- ubuntu-toolchain-r-test
|
||||
- george-edison55-precise-backports
|
||||
packages:
|
||||
- doxygen
|
||||
- libopenmpi-dev
|
||||
- wget
|
||||
- git
|
||||
- libcurl4-openssl-dev
|
||||
@ -26,30 +35,43 @@ addons:
|
||||
- python-numpy
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- openmpi-bin
|
||||
- openmpi-common
|
||||
- openssh-client
|
||||
- openssh-server
|
||||
- libopenmpi-dev
|
||||
- python3
|
||||
- python3-setuptools
|
||||
homebrew:
|
||||
packages:
|
||||
- gcc49
|
||||
- openssl
|
||||
- libgit2
|
||||
- python3
|
||||
update: true
|
||||
|
||||
before_install:
|
||||
- export TRAVIS=dmlc-core/scripts/travis/
|
||||
- source ${TRAVIS}/travis_setup_env.sh
|
||||
- ${TRAVIS}/travis_osx_install.sh
|
||||
|
||||
install:
|
||||
- pip install --user cpplint pylint kubernetes urllib3
|
||||
- if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi
|
||||
- if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi
|
||||
- pip3 install cpplint pylint urllib3 numpy
|
||||
- pip3 install websocket-client kubernetes
|
||||
|
||||
script: scripts/travis_script.sh
|
||||
|
||||
|
||||
before_cache:
|
||||
- ${TRAVIS}/travis_before_cache.sh
|
||||
|
||||
|
||||
cache:
|
||||
directories:
|
||||
- ${HOME}/.cache/usr
|
||||
- ${HOME}/.cache/pip
|
||||
- mpich
|
||||
|
||||
before_cache:
|
||||
- ${TRAVIS}/travis_before_cache.sh
|
||||
|
||||
after_success:
|
||||
- tree build
|
||||
- bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
|
||||
|
||||
notifications:
|
||||
# Emails are sent to the committer's git-configured email address by default,
|
||||
@ -57,4 +79,3 @@ notifications:
|
||||
on_success: change
|
||||
on_failure: always
|
||||
|
||||
|
||||
|
||||
43
Makefile
43
Makefile
@ -1,23 +1,26 @@
|
||||
OS := $(shell uname)
|
||||
|
||||
export MPICXX = mpicxx
|
||||
export LDFLAGS= -Llib
|
||||
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
|
||||
export CFLAGS = -O3 $(WARNFLAGS) -I $(DMLC)/include -I include/
|
||||
export LDFLAGS =-Llib
|
||||
|
||||
OS := $(shell uname)
|
||||
#download mpi
|
||||
#echo $(shell scripts/mpi.sh)
|
||||
|
||||
MPICXX=./mpich/bin/mpicxx
|
||||
|
||||
ifeq ($(OS), Darwin)
|
||||
ifndef CC
|
||||
export CC = $(if $(shell which clang), clang, gcc)
|
||||
export CC = gcc-4.9
|
||||
endif
|
||||
ifndef CXX
|
||||
export CXX = $(if $(shell which clang++), clang++, g++)
|
||||
export CXX = g++-4.9
|
||||
endif
|
||||
else
|
||||
ifeq ($(OS), FreeBSD)
|
||||
ifndef CXX
|
||||
export CXX = g++6
|
||||
endif
|
||||
export MPICXX = /usr/local/mpi/bin/mpicxx
|
||||
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
|
||||
else
|
||||
# linux defaults
|
||||
@ -27,13 +30,10 @@ else
|
||||
ifndef CXX
|
||||
export CXX = g++
|
||||
endif
|
||||
LDFLAGS += -lrt
|
||||
LDFLAGS +=-lrt
|
||||
endif
|
||||
endif
|
||||
|
||||
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
|
||||
export CFLAGS = -O3 $(WARNFLAGS)
|
||||
|
||||
#----------------------------
|
||||
# Settings for power and arm arch
|
||||
#----------------------------
|
||||
@ -69,8 +69,10 @@ BPATH=.
|
||||
MPIOBJ= $(BPATH)/engine_mpi.o
|
||||
OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
|
||||
$(BPATH)/c_api.o $(BPATH)/engine_base.o
|
||||
SLIB= lib/librabit.so lib/librabit_mpi.so lib/librabit_mock.so lib/librabit_base.so
|
||||
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
|
||||
SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so
|
||||
ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
|
||||
MPISLIB= lib/librabit_mpi.so
|
||||
MPIALIB= lib/librabit_mpi.a
|
||||
HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h
|
||||
DMLC=dmlc-core
|
||||
|
||||
@ -95,10 +97,7 @@ lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o
|
||||
lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ)
|
||||
|
||||
$(OBJ) :
|
||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I include/ -I $(DMLC)/include
|
||||
|
||||
$(MPIOBJ) :
|
||||
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I $(DMLC)/include
|
||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||
|
||||
$(ALIB):
|
||||
ar cr $@ $+
|
||||
@ -106,6 +105,16 @@ $(ALIB):
|
||||
$(SLIB) :
|
||||
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
|
||||
|
||||
$(MPIOBJ) :
|
||||
$(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||
|
||||
$(MPIALIB):
|
||||
ar cr $@ $+
|
||||
|
||||
$(MPISLIB) :
|
||||
$(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \
|
||||
$(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi
|
||||
|
||||
lint:
|
||||
$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include
|
||||
|
||||
@ -113,4 +122,4 @@ doc doxygen:
|
||||
cd include; doxygen ../doc/Doxyfile; cd -
|
||||
|
||||
clean:
|
||||
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
|
||||
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
|
||||
|
||||
@ -1 +1 @@
|
||||
Subproject commit 15362f8fcc7345d60de13a676a2cbd3ffdc3f064
|
||||
Subproject commit 13d5acb8ba7e79550bbf2f730f1a3944ff0fa68b
|
||||
@ -4,7 +4,7 @@ Reliable Allreduce and Broadcast Library.
|
||||
Author: Tianqi Chen
|
||||
"""
|
||||
# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
|
||||
import cPickle as pickle
|
||||
import pickle
|
||||
import ctypes
|
||||
import os
|
||||
import sys
|
||||
@ -99,9 +99,10 @@ def init(args=None, lib='standard', lib_dll=None):
|
||||
When this is presented argument lib will be ignored.
|
||||
"""
|
||||
if args is None:
|
||||
args = sys.argv
|
||||
args = []
|
||||
_loadlib(lib, lib_dll)
|
||||
arr = (ctypes.c_char_p * len(args))()
|
||||
|
||||
arr[:] = args
|
||||
_LIB.RabitInit(len(args), arr)
|
||||
|
||||
|
||||
27
scripts/mpi_build.sh
Executable file
27
scripts/mpi_build.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
if [ -f mpich/lib/libmpich.so ]; then
|
||||
echo "libmpich.so found -- nothing to build."
|
||||
else
|
||||
echo "Downloading mpich source."
|
||||
wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz
|
||||
tar xfz mpich-3.2.tar.gz
|
||||
rm mpich-3.2.tar.gz*
|
||||
echo "configuring and building mpich."
|
||||
cd mpich-3.2
|
||||
#CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64
|
||||
./configure \
|
||||
--prefix=`pwd`/../mpich \
|
||||
--enable-static=false \
|
||||
--enable-alloca=true \
|
||||
--disable-long-double \
|
||||
--enable-threads=single \
|
||||
--enable-fortran=no \
|
||||
--enable-fast=all \
|
||||
--enable-g=none \
|
||||
--enable-timing=none \
|
||||
--enable-cxx
|
||||
make -j4
|
||||
make install
|
||||
cd -
|
||||
fi
|
||||
@ -7,3 +7,4 @@ make -f test.mk local_recover_10_10k || exit -1
|
||||
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
|
||||
make -f test.mk lazy_recover_10_10k_die_same || exit -1
|
||||
make -f test.mk ringallreduce_10_10k || exit -1
|
||||
make -f test.mk pylocal_recover_10_10k || exit -1
|
||||
|
||||
@ -15,6 +15,7 @@ if [ ${TASK} == "build" ]; then
|
||||
fi
|
||||
|
||||
if [ ${TASK} == "mpi-build" ]; then
|
||||
./scripts/mpi_build.sh
|
||||
cd test
|
||||
make mpi && make speed_test.mpi || exit -1
|
||||
fi
|
||||
|
||||
@ -131,6 +131,8 @@ void AllreduceBase::Shutdown(void) {
|
||||
utils::TCPSocket tracker = this->ConnectTracker();
|
||||
tracker.SendStr(std::string("shutdown"));
|
||||
tracker.Close();
|
||||
// close listening sockets
|
||||
sock_listen.Close();
|
||||
utils::TCPSocket::Finalize();
|
||||
}
|
||||
void AllreduceBase::TrackerPrint(const std::string &msg) {
|
||||
@ -271,12 +273,26 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
||||
"ReConnectLink failure 4");
|
||||
Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
|
||||
"ReConnectLink failure 4");
|
||||
// create listening socket
|
||||
utils::TCPSocket sock_listen;
|
||||
sock_listen.Create();
|
||||
int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
|
||||
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
|
||||
sock_listen.Listen();
|
||||
|
||||
if (sock_listen == INVALID_SOCKET || sock_listen.AtMark()) {
|
||||
if (!sock_listen.IsClosed()) {
|
||||
sock_listen.Close();
|
||||
}
|
||||
// create listening socket
|
||||
sock_listen.Create();
|
||||
sock_listen.SetKeepAlive(true);
|
||||
// http://deepix.github.io/2016/10/21/tcprst.html
|
||||
sock_listen.SetLinger(0);
|
||||
// [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial)
|
||||
// work around processes bind to same port without set reuse option,
|
||||
// start explore from slave_port + newrank towards end
|
||||
port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial);
|
||||
// if no port bindable, explore first half of range
|
||||
if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port);
|
||||
|
||||
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
|
||||
sock_listen.Listen();
|
||||
}
|
||||
|
||||
// get number of to connect and number of to accept nodes from tracker
|
||||
int num_conn, num_accept, num_error = 1;
|
||||
@ -311,6 +327,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
||||
"ReConnectLink failure 9");
|
||||
Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
|
||||
"ReConnectLink failure 10");
|
||||
|
||||
r.sock.Create();
|
||||
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
|
||||
num_error += 1; r.sock.Close(); continue;
|
||||
@ -357,8 +374,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
||||
}
|
||||
if (!match) all_links.push_back(r);
|
||||
}
|
||||
// close listening sockets
|
||||
sock_listen.Close();
|
||||
|
||||
this->parent_index = -1;
|
||||
// setup tree links and ring structure
|
||||
tree_links.plinks.clear();
|
||||
|
||||
@ -521,6 +521,10 @@ class AllreduceBase : public IEngine {
|
||||
int world_size;
|
||||
// connect retry time
|
||||
int connect_retry;
|
||||
// backdoor listening peer connection
|
||||
utils::TCPSocket sock_listen;
|
||||
// backdoor port
|
||||
int port = 0;
|
||||
};
|
||||
} // namespace engine
|
||||
} // namespace rabit
|
||||
|
||||
@ -51,12 +51,10 @@ void AllreduceRobust::Shutdown(void) {
|
||||
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
|
||||
"Shutdown: check ack must return true");
|
||||
|
||||
// one worker shutdowns and closes sockets while rest still run kCheckAck,
|
||||
// seems has something to do with time-wait state in tcp connection,
|
||||
// this cause rest workers checkandrecover and hang inf,
|
||||
// https://github.com/dmlc/xgboost/pull/3818
|
||||
// TODO(Chen Qin): a fundamental fix for this
|
||||
sleep(1);
|
||||
#if defined (__APPLE__)
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
AllreduceBase::Shutdown();
|
||||
}
|
||||
/*!
|
||||
|
||||
10
src/socket.h
10
src/socket.h
@ -276,13 +276,21 @@ class TCPSocket : public Socket{
|
||||
* \brief enable/disable TCP keepalive
|
||||
* \param keepalive whether to set the keep alive option on
|
||||
*/
|
||||
inline void SetKeepAlive(bool keepalive) {
|
||||
void SetKeepAlive(bool keepalive) {
|
||||
int opt = static_cast<int>(keepalive);
|
||||
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
|
||||
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
|
||||
Socket::Error("SetKeepAlive");
|
||||
}
|
||||
}
|
||||
inline void SetLinger(int timeout = 0) {
|
||||
struct linger sl;
|
||||
sl.l_onoff = 1; /* non-zero value enables linger option in kernel */
|
||||
sl.l_linger = timeout; /* timeout interval in seconds */
|
||||
if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)) == -1) {
|
||||
Socket::Error("SO_LINGER");
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief create the socket, call this before using socket
|
||||
* \param af domain
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
export MPICXX = mpicxx
|
||||
MPICXX=../mpich/bin/mpicxx
|
||||
export LDFLAGS= -L../lib -pthread -lm
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11
|
||||
|
||||
@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
|
||||
MPIBIN = speed_test.mpi
|
||||
.PHONY: clean all lib mpi
|
||||
|
||||
.PHONY: lib all
|
||||
|
||||
all: $(BIN)
|
||||
|
||||
lib:
|
||||
cd ..;make;cd -
|
||||
cd ..;make clean;make;cd -
|
||||
|
||||
.PHONY: mpi
|
||||
mpi:
|
||||
cd ..;make mpi;cd -
|
||||
|
||||
# programs
|
||||
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
|
||||
model_recover.o: model_recover.cc ../include/rabit/*.h lib
|
||||
@ -52,7 +58,8 @@ $(OBJ) :
|
||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||
|
||||
$(MPIBIN) :
|
||||
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS)
|
||||
$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
|
||||
../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
|
||||
|
||||
clean:
|
||||
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
|
||||
|
||||
@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
|
||||
TestSum(&model, ntrial, r);
|
||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||
rabit::LazyCheckPoint(&model);
|
||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
||||
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
||||
}
|
||||
rabit::Finalize();
|
||||
return 0;
|
||||
|
||||
@ -130,7 +130,7 @@ int main(int argc, char *argv[]) {
|
||||
TestSum(&model, &local, ntrial, r);
|
||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||
rabit::CheckPoint(&model, &local);
|
||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
||||
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
||||
}
|
||||
rabit::Finalize();
|
||||
return 0;
|
||||
|
||||
@ -1,6 +1,11 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import print_function
|
||||
from builtins import range
|
||||
|
||||
import sys
|
||||
sys.path.append('../python')
|
||||
|
||||
import rabit
|
||||
import numpy as np
|
||||
|
||||
|
||||
18
test/test.mk
18
test/test.mk
@ -1,29 +1,29 @@
|
||||
# this is a makefile used to show testcases of rabit
|
||||
.PHONY: all
|
||||
|
||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
|
||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
|
||||
|
||||
# this experiment test recovery with actually process exit, use keepalive to keep program alive
|
||||
model_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||
|
||||
model_recover_10_10k_die_same:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
model_recover_10_10k_die_hard:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
local_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
|
||||
pylocal_recover_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||
|
||||
lazy_recover_10_10k_die_hard:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||
|
||||
lazy_recover_10_10k_die_same:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||
|
||||
ringallreduce_10_10k:
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10
|
||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user