[rabit harden] Enable all tests (#90)
* include osx in tests * address `time_wait` on port assignment * increase submit attempts. * cleanup tests
This commit is contained in:
parent
ecd4bf7aae
commit
e3d51d3e62
6
.gitignore
vendored
6
.gitignore
vendored
@ -37,8 +37,12 @@ recommonmark
|
|||||||
recom
|
recom
|
||||||
_*
|
_*
|
||||||
|
|
||||||
|
#mpi lib
|
||||||
|
mpich/
|
||||||
|
mpich-3.2/
|
||||||
|
|
||||||
# Jetbrain
|
# Jetbrain
|
||||||
.idea
|
.idea
|
||||||
cmake-build-debug/
|
cmake-build-debug/
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
|||||||
51
.travis.yml
51
.travis.yml
@ -1,5 +1,12 @@
|
|||||||
# disable sudo to use container based build
|
sudo: true
|
||||||
sudo: false
|
|
||||||
|
os:
|
||||||
|
- linux
|
||||||
|
- osx
|
||||||
|
|
||||||
|
osx_image: xcode10.2
|
||||||
|
|
||||||
|
dist: xenial
|
||||||
|
|
||||||
# Use Build Matrix to do lint and build separately
|
# Use Build Matrix to do lint and build separately
|
||||||
env:
|
env:
|
||||||
@ -10,15 +17,17 @@ env:
|
|||||||
- TASK=build
|
- TASK=build
|
||||||
- TASK=mpi-build
|
- TASK=mpi-build
|
||||||
- TASK=cmake-build
|
- TASK=cmake-build
|
||||||
- TASK=test CXX=g++-4.8
|
- TASK=test CXX=g++
|
||||||
|
|
||||||
# dependent apt packages
|
# dependent apt packages
|
||||||
dist: xenial
|
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
|
sources:
|
||||||
|
- llvm-toolchain-trusty-5.0
|
||||||
|
- ubuntu-toolchain-r-test
|
||||||
|
- george-edison55-precise-backports
|
||||||
packages:
|
packages:
|
||||||
- doxygen
|
- doxygen
|
||||||
- libopenmpi-dev
|
|
||||||
- wget
|
- wget
|
||||||
- git
|
- git
|
||||||
- libcurl4-openssl-dev
|
- libcurl4-openssl-dev
|
||||||
@ -26,30 +35,43 @@ addons:
|
|||||||
- python-numpy
|
- python-numpy
|
||||||
- gcc-4.8
|
- gcc-4.8
|
||||||
- g++-4.8
|
- g++-4.8
|
||||||
- openmpi-bin
|
|
||||||
- openmpi-common
|
|
||||||
- openssh-client
|
- openssh-client
|
||||||
- openssh-server
|
- openssh-server
|
||||||
- libopenmpi-dev
|
- python3
|
||||||
|
- python3-setuptools
|
||||||
|
homebrew:
|
||||||
|
packages:
|
||||||
|
- gcc49
|
||||||
|
- openssl
|
||||||
|
- libgit2
|
||||||
|
- python3
|
||||||
|
update: true
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- export TRAVIS=dmlc-core/scripts/travis/
|
- export TRAVIS=dmlc-core/scripts/travis/
|
||||||
- source ${TRAVIS}/travis_setup_env.sh
|
- source ${TRAVIS}/travis_setup_env.sh
|
||||||
|
- ${TRAVIS}/travis_osx_install.sh
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- pip install --user cpplint pylint kubernetes urllib3
|
- if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi
|
||||||
|
- if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then brew install python3; fi
|
||||||
|
- pip3 install cpplint pylint urllib3 numpy
|
||||||
|
- pip3 install websocket-client kubernetes
|
||||||
|
|
||||||
script: scripts/travis_script.sh
|
script: scripts/travis_script.sh
|
||||||
|
|
||||||
|
|
||||||
before_cache:
|
|
||||||
- ${TRAVIS}/travis_before_cache.sh
|
|
||||||
|
|
||||||
|
|
||||||
cache:
|
cache:
|
||||||
directories:
|
directories:
|
||||||
- ${HOME}/.cache/usr
|
- ${HOME}/.cache/usr
|
||||||
|
- ${HOME}/.cache/pip
|
||||||
|
- mpich
|
||||||
|
|
||||||
|
before_cache:
|
||||||
|
- ${TRAVIS}/travis_before_cache.sh
|
||||||
|
|
||||||
|
after_success:
|
||||||
|
- tree build
|
||||||
|
- bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
|
||||||
|
|
||||||
notifications:
|
notifications:
|
||||||
# Emails are sent to the committer's git-configured email address by default,
|
# Emails are sent to the committer's git-configured email address by default,
|
||||||
@ -57,4 +79,3 @@ notifications:
|
|||||||
on_success: change
|
on_success: change
|
||||||
on_failure: always
|
on_failure: always
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
43
Makefile
43
Makefile
@ -1,23 +1,26 @@
|
|||||||
OS := $(shell uname)
|
OS := $(shell uname)
|
||||||
|
|
||||||
export MPICXX = mpicxx
|
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
|
||||||
export LDFLAGS= -Llib
|
export CFLAGS = -O3 $(WARNFLAGS) -I $(DMLC)/include -I include/
|
||||||
|
export LDFLAGS =-Llib
|
||||||
|
|
||||||
OS := $(shell uname)
|
#download mpi
|
||||||
|
#echo $(shell scripts/mpi.sh)
|
||||||
|
|
||||||
|
MPICXX=./mpich/bin/mpicxx
|
||||||
|
|
||||||
ifeq ($(OS), Darwin)
|
ifeq ($(OS), Darwin)
|
||||||
ifndef CC
|
ifndef CC
|
||||||
export CC = $(if $(shell which clang), clang, gcc)
|
export CC = gcc-4.9
|
||||||
endif
|
endif
|
||||||
ifndef CXX
|
ifndef CXX
|
||||||
export CXX = $(if $(shell which clang++), clang++, g++)
|
export CXX = g++-4.9
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
ifeq ($(OS), FreeBSD)
|
ifeq ($(OS), FreeBSD)
|
||||||
ifndef CXX
|
ifndef CXX
|
||||||
export CXX = g++6
|
export CXX = g++6
|
||||||
endif
|
endif
|
||||||
export MPICXX = /usr/local/mpi/bin/mpicxx
|
|
||||||
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
|
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
|
||||||
else
|
else
|
||||||
# linux defaults
|
# linux defaults
|
||||||
@ -27,13 +30,10 @@ else
|
|||||||
ifndef CXX
|
ifndef CXX
|
||||||
export CXX = g++
|
export CXX = g++
|
||||||
endif
|
endif
|
||||||
LDFLAGS += -lrt
|
LDFLAGS +=-lrt
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
|
|
||||||
export CFLAGS = -O3 $(WARNFLAGS)
|
|
||||||
|
|
||||||
#----------------------------
|
#----------------------------
|
||||||
# Settings for power and arm arch
|
# Settings for power and arm arch
|
||||||
#----------------------------
|
#----------------------------
|
||||||
@ -69,8 +69,10 @@ BPATH=.
|
|||||||
MPIOBJ= $(BPATH)/engine_mpi.o
|
MPIOBJ= $(BPATH)/engine_mpi.o
|
||||||
OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
|
OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
|
||||||
$(BPATH)/c_api.o $(BPATH)/engine_base.o
|
$(BPATH)/c_api.o $(BPATH)/engine_base.o
|
||||||
SLIB= lib/librabit.so lib/librabit_mpi.so lib/librabit_mock.so lib/librabit_base.so
|
SLIB= lib/librabit.so lib/librabit_mock.so lib/librabit_base.so
|
||||||
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
|
ALIB= lib/librabit.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
|
||||||
|
MPISLIB= lib/librabit_mpi.so
|
||||||
|
MPIALIB= lib/librabit_mpi.a
|
||||||
HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h
|
HEADERS=src/*.h include/rabit/*.h include/rabit/internal/*.h
|
||||||
DMLC=dmlc-core
|
DMLC=dmlc-core
|
||||||
|
|
||||||
@ -95,10 +97,7 @@ lib/librabit_empty.a: $(BPATH)/engine_empty.o $(BPATH)/c_api.o
|
|||||||
lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ)
|
lib/librabit_mpi.a lib/librabit_mpi.so: $(MPIOBJ)
|
||||||
|
|
||||||
$(OBJ) :
|
$(OBJ) :
|
||||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I include/ -I $(DMLC)/include
|
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||||
|
|
||||||
$(MPIOBJ) :
|
|
||||||
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -I $(DMLC)/include
|
|
||||||
|
|
||||||
$(ALIB):
|
$(ALIB):
|
||||||
ar cr $@ $+
|
ar cr $@ $+
|
||||||
@ -106,6 +105,16 @@ $(ALIB):
|
|||||||
$(SLIB) :
|
$(SLIB) :
|
||||||
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
|
$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
|
||||||
|
|
||||||
|
$(MPIOBJ) :
|
||||||
|
$(MPICXX) -c $(CFLAGS) -I./mpich/include -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||||
|
|
||||||
|
$(MPIALIB):
|
||||||
|
ar cr $@ $+
|
||||||
|
|
||||||
|
$(MPISLIB) :
|
||||||
|
$(MPICXX) $(CFLAGS) -I./mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) \
|
||||||
|
$(LDFLAGS) -L./mpich/lib -Wl,-rpath,./mpich/lib -lmpi
|
||||||
|
|
||||||
lint:
|
lint:
|
||||||
$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include
|
$(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include
|
||||||
|
|
||||||
@ -113,4 +122,4 @@ doc doxygen:
|
|||||||
cd include; doxygen ../doc/Doxyfile; cd -
|
cd include; doxygen ../doc/Doxyfile; cd -
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
|
$(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
Subproject commit 15362f8fcc7345d60de13a676a2cbd3ffdc3f064
|
Subproject commit 13d5acb8ba7e79550bbf2f730f1a3944ff0fa68b
|
||||||
@ -4,7 +4,7 @@ Reliable Allreduce and Broadcast Library.
|
|||||||
Author: Tianqi Chen
|
Author: Tianqi Chen
|
||||||
"""
|
"""
|
||||||
# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
|
# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
|
||||||
import cPickle as pickle
|
import pickle
|
||||||
import ctypes
|
import ctypes
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@ -99,9 +99,10 @@ def init(args=None, lib='standard', lib_dll=None):
|
|||||||
When this is presented argument lib will be ignored.
|
When this is presented argument lib will be ignored.
|
||||||
"""
|
"""
|
||||||
if args is None:
|
if args is None:
|
||||||
args = sys.argv
|
args = []
|
||||||
_loadlib(lib, lib_dll)
|
_loadlib(lib, lib_dll)
|
||||||
arr = (ctypes.c_char_p * len(args))()
|
arr = (ctypes.c_char_p * len(args))()
|
||||||
|
|
||||||
arr[:] = args
|
arr[:] = args
|
||||||
_LIB.RabitInit(len(args), arr)
|
_LIB.RabitInit(len(args), arr)
|
||||||
|
|
||||||
|
|||||||
27
scripts/mpi_build.sh
Executable file
27
scripts/mpi_build.sh
Executable file
@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
if [ -f mpich/lib/libmpich.so ]; then
|
||||||
|
echo "libmpich.so found -- nothing to build."
|
||||||
|
else
|
||||||
|
echo "Downloading mpich source."
|
||||||
|
wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz
|
||||||
|
tar xfz mpich-3.2.tar.gz
|
||||||
|
rm mpich-3.2.tar.gz*
|
||||||
|
echo "configuring and building mpich."
|
||||||
|
cd mpich-3.2
|
||||||
|
#CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64
|
||||||
|
./configure \
|
||||||
|
--prefix=`pwd`/../mpich \
|
||||||
|
--enable-static=false \
|
||||||
|
--enable-alloca=true \
|
||||||
|
--disable-long-double \
|
||||||
|
--enable-threads=single \
|
||||||
|
--enable-fortran=no \
|
||||||
|
--enable-fast=all \
|
||||||
|
--enable-g=none \
|
||||||
|
--enable-timing=none \
|
||||||
|
--enable-cxx
|
||||||
|
make -j4
|
||||||
|
make install
|
||||||
|
cd -
|
||||||
|
fi
|
||||||
@ -7,3 +7,4 @@ make -f test.mk local_recover_10_10k || exit -1
|
|||||||
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
|
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
|
||||||
make -f test.mk lazy_recover_10_10k_die_same || exit -1
|
make -f test.mk lazy_recover_10_10k_die_same || exit -1
|
||||||
make -f test.mk ringallreduce_10_10k || exit -1
|
make -f test.mk ringallreduce_10_10k || exit -1
|
||||||
|
make -f test.mk pylocal_recover_10_10k || exit -1
|
||||||
|
|||||||
@ -15,6 +15,7 @@ if [ ${TASK} == "build" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ${TASK} == "mpi-build" ]; then
|
if [ ${TASK} == "mpi-build" ]; then
|
||||||
|
./scripts/mpi_build.sh
|
||||||
cd test
|
cd test
|
||||||
make mpi && make speed_test.mpi || exit -1
|
make mpi && make speed_test.mpi || exit -1
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -131,6 +131,8 @@ void AllreduceBase::Shutdown(void) {
|
|||||||
utils::TCPSocket tracker = this->ConnectTracker();
|
utils::TCPSocket tracker = this->ConnectTracker();
|
||||||
tracker.SendStr(std::string("shutdown"));
|
tracker.SendStr(std::string("shutdown"));
|
||||||
tracker.Close();
|
tracker.Close();
|
||||||
|
// close listening sockets
|
||||||
|
sock_listen.Close();
|
||||||
utils::TCPSocket::Finalize();
|
utils::TCPSocket::Finalize();
|
||||||
}
|
}
|
||||||
void AllreduceBase::TrackerPrint(const std::string &msg) {
|
void AllreduceBase::TrackerPrint(const std::string &msg) {
|
||||||
@ -271,12 +273,26 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
|
Assert(tracker.RecvAll(&next_rank, sizeof(next_rank)) == sizeof(next_rank),
|
||||||
"ReConnectLink failure 4");
|
"ReConnectLink failure 4");
|
||||||
// create listening socket
|
|
||||||
utils::TCPSocket sock_listen;
|
if (sock_listen == INVALID_SOCKET || sock_listen.AtMark()) {
|
||||||
sock_listen.Create();
|
if (!sock_listen.IsClosed()) {
|
||||||
int port = sock_listen.TryBindHost(slave_port, slave_port + nport_trial);
|
sock_listen.Close();
|
||||||
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
|
}
|
||||||
sock_listen.Listen();
|
// create listening socket
|
||||||
|
sock_listen.Create();
|
||||||
|
sock_listen.SetKeepAlive(true);
|
||||||
|
// http://deepix.github.io/2016/10/21/tcprst.html
|
||||||
|
sock_listen.SetLinger(0);
|
||||||
|
// [slave_port, slave_port+1 .... slave_port + newrank ...slave_port + nport_trial)
|
||||||
|
// work around processes bind to same port without set reuse option,
|
||||||
|
// start explore from slave_port + newrank towards end
|
||||||
|
port = sock_listen.TryBindHost(slave_port+ newrank%nport_trial, slave_port + nport_trial);
|
||||||
|
// if no port bindable, explore first half of range
|
||||||
|
if (port == -1) sock_listen.TryBindHost(slave_port, newrank% nport_trial + slave_port);
|
||||||
|
|
||||||
|
utils::Check(port != -1, "ReConnectLink fail to bind the ports specified");
|
||||||
|
sock_listen.Listen();
|
||||||
|
}
|
||||||
|
|
||||||
// get number of to connect and number of to accept nodes from tracker
|
// get number of to connect and number of to accept nodes from tracker
|
||||||
int num_conn, num_accept, num_error = 1;
|
int num_conn, num_accept, num_error = 1;
|
||||||
@ -311,6 +327,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
"ReConnectLink failure 9");
|
"ReConnectLink failure 9");
|
||||||
Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
|
Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank),
|
||||||
"ReConnectLink failure 10");
|
"ReConnectLink failure 10");
|
||||||
|
|
||||||
r.sock.Create();
|
r.sock.Create();
|
||||||
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
|
if (!r.sock.Connect(utils::SockAddr(hname.c_str(), hport))) {
|
||||||
num_error += 1; r.sock.Close(); continue;
|
num_error += 1; r.sock.Close(); continue;
|
||||||
@ -357,8 +374,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
}
|
}
|
||||||
if (!match) all_links.push_back(r);
|
if (!match) all_links.push_back(r);
|
||||||
}
|
}
|
||||||
// close listening sockets
|
|
||||||
sock_listen.Close();
|
|
||||||
this->parent_index = -1;
|
this->parent_index = -1;
|
||||||
// setup tree links and ring structure
|
// setup tree links and ring structure
|
||||||
tree_links.plinks.clear();
|
tree_links.plinks.clear();
|
||||||
|
|||||||
@ -521,6 +521,10 @@ class AllreduceBase : public IEngine {
|
|||||||
int world_size;
|
int world_size;
|
||||||
// connect retry time
|
// connect retry time
|
||||||
int connect_retry;
|
int connect_retry;
|
||||||
|
// backdoor listening peer connection
|
||||||
|
utils::TCPSocket sock_listen;
|
||||||
|
// backdoor port
|
||||||
|
int port = 0;
|
||||||
};
|
};
|
||||||
} // namespace engine
|
} // namespace engine
|
||||||
} // namespace rabit
|
} // namespace rabit
|
||||||
|
|||||||
@ -51,12 +51,10 @@ void AllreduceRobust::Shutdown(void) {
|
|||||||
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
|
utils::Assert(RecoverExec(NULL, 0, ActionSummary::kCheckAck, ActionSummary::kSpecialOp),
|
||||||
"Shutdown: check ack must return true");
|
"Shutdown: check ack must return true");
|
||||||
|
|
||||||
// one worker shutdowns and closes sockets while rest still run kCheckAck,
|
#if defined (__APPLE__)
|
||||||
// seems has something to do with time-wait state in tcp connection,
|
sleep(1);
|
||||||
// this cause rest workers checkandrecover and hang inf,
|
#endif
|
||||||
// https://github.com/dmlc/xgboost/pull/3818
|
|
||||||
// TODO(Chen Qin): a fundamental fix for this
|
|
||||||
sleep(1);
|
|
||||||
AllreduceBase::Shutdown();
|
AllreduceBase::Shutdown();
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
|
|||||||
10
src/socket.h
10
src/socket.h
@ -276,13 +276,21 @@ class TCPSocket : public Socket{
|
|||||||
* \brief enable/disable TCP keepalive
|
* \brief enable/disable TCP keepalive
|
||||||
* \param keepalive whether to set the keep alive option on
|
* \param keepalive whether to set the keep alive option on
|
||||||
*/
|
*/
|
||||||
inline void SetKeepAlive(bool keepalive) {
|
void SetKeepAlive(bool keepalive) {
|
||||||
int opt = static_cast<int>(keepalive);
|
int opt = static_cast<int>(keepalive);
|
||||||
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
|
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE,
|
||||||
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
|
reinterpret_cast<char*>(&opt), sizeof(opt)) < 0) {
|
||||||
Socket::Error("SetKeepAlive");
|
Socket::Error("SetKeepAlive");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
inline void SetLinger(int timeout = 0) {
|
||||||
|
struct linger sl;
|
||||||
|
sl.l_onoff = 1; /* non-zero value enables linger option in kernel */
|
||||||
|
sl.l_linger = timeout; /* timeout interval in seconds */
|
||||||
|
if (setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)) == -1) {
|
||||||
|
Socket::Error("SO_LINGER");
|
||||||
|
}
|
||||||
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief create the socket, call this before using socket
|
* \brief create the socket, call this before using socket
|
||||||
* \param af domain
|
* \param af domain
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
export MPICXX = mpicxx
|
MPICXX=../mpich/bin/mpicxx
|
||||||
export LDFLAGS= -L../lib -pthread -lm
|
export LDFLAGS= -L../lib -pthread -lm
|
||||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11
|
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -I ../dmlc-core/include -std=c++11
|
||||||
|
|
||||||
@ -27,11 +27,17 @@ OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
|
|||||||
MPIBIN = speed_test.mpi
|
MPIBIN = speed_test.mpi
|
||||||
.PHONY: clean all lib mpi
|
.PHONY: clean all lib mpi
|
||||||
|
|
||||||
|
.PHONY: lib all
|
||||||
|
|
||||||
all: $(BIN)
|
all: $(BIN)
|
||||||
|
|
||||||
lib:
|
lib:
|
||||||
cd ..;make;cd -
|
cd ..;make clean;make;cd -
|
||||||
|
|
||||||
|
.PHONY: mpi
|
||||||
mpi:
|
mpi:
|
||||||
cd ..;make mpi;cd -
|
cd ..;make mpi;cd -
|
||||||
|
|
||||||
# programs
|
# programs
|
||||||
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
|
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
|
||||||
model_recover.o: model_recover.cc ../include/rabit/*.h lib
|
model_recover.o: model_recover.cc ../include/rabit/*.h lib
|
||||||
@ -52,7 +58,8 @@ $(OBJ) :
|
|||||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
||||||
|
|
||||||
$(MPIBIN) :
|
$(MPIBIN) :
|
||||||
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mpi.so $(LDFLAGS)
|
$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
|
||||||
|
../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
|
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
|
||||||
|
|||||||
@ -118,7 +118,7 @@ int main(int argc, char *argv[]) {
|
|||||||
TestSum(&model, ntrial, r);
|
TestSum(&model, ntrial, r);
|
||||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||||
rabit::LazyCheckPoint(&model);
|
rabit::LazyCheckPoint(&model);
|
||||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
||||||
}
|
}
|
||||||
rabit::Finalize();
|
rabit::Finalize();
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@ -130,7 +130,7 @@ int main(int argc, char *argv[]) {
|
|||||||
TestSum(&model, &local, ntrial, r);
|
TestSum(&model, &local, ntrial, r);
|
||||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
||||||
rabit::CheckPoint(&model, &local);
|
rabit::CheckPoint(&model, &local);
|
||||||
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
|
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
||||||
}
|
}
|
||||||
rabit::Finalize();
|
rabit::Finalize();
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@ -1,6 +1,11 @@
|
|||||||
#!/usr/bin/python
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
from builtins import range
|
from builtins import range
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.path.append('../python')
|
||||||
|
|
||||||
import rabit
|
import rabit
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|||||||
18
test/test.mk
18
test/test.mk
@ -1,29 +1,29 @@
|
|||||||
# this is a makefile used to show testcases of rabit
|
# this is a makefile used to show testcases of rabit
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
|
|
||||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k
|
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
|
||||||
|
|
||||||
# this experiment tests recovery with actual process exit, using keepalive to keep the program alive
|
# this experiment tests recovery with actual process exit, using keepalive to keep the program alive
|
||||||
model_recover_10_10k:
|
model_recover_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
|
||||||
|
|
||||||
model_recover_10_10k_die_same:
|
model_recover_10_10k_die_same:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||||
|
|
||||||
model_recover_10_10k_die_hard:
|
model_recover_10_10k_die_hard:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||||
|
|
||||||
local_recover_10_10k:
|
local_recover_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||||
|
|
||||||
pylocal_recover_10_10k:
|
pylocal_recover_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
||||||
|
|
||||||
lazy_recover_10_10k_die_hard:
|
lazy_recover_10_10k_die_hard:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
||||||
|
|
||||||
lazy_recover_10_10k_die_same:
|
lazy_recover_10_10k_die_same:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
||||||
|
|
||||||
ringallreduce_10_10k:
|
ringallreduce_10_10k:
|
||||||
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10
|
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user