remove is_bootstrap parameter (#102)

* apply openmp simd

* clean __buildin detection, moving windows build check from xgboost project, add openmp support for vectorize reduce

* apply openmp only to rabit

* orgnize rabit signature

* remove is_bootstrap, use load_checkpoint as implict flag

* visual studio don't support latest openmp

* orgnize omp declarations

* replace memory copy with vector cast

* Revert "replace memory copy with vector cast"

This reverts commit 28de4792dcdff40d83d458510d23b7ef0b191d79.

* Revert "orgnize omp declarations"

This reverts commit 31341233d31ce93ccf34d700262b1f3f6690bbfe.

* remove openmp settings, merge into a upcoming pr

* mis

* per feedback, update comments
This commit is contained in:
Chen Qin 2019-09-10 11:45:50 -07:00 committed by Nan Zhu
parent 5797dcb64e
commit 9a7ac85d7e
13 changed files with 56 additions and 137 deletions

1
.gitignore vendored
View File

@ -23,6 +23,7 @@
*.lib
# Executables
*.miss
*.exe
*.out
*.app

View File

@ -12,16 +12,23 @@ endif()
option(RABIT_BUILD_TESTS "Build rabit tests" OFF)
option(RABIT_BUILD_MPI "Build MPI" OFF)
option(RABIT_BUILD_DMLC "Include DMLC_CORE in build" ON)
option(RABIT_BUILD_DMLC "Include DMLC_CORE in build" OFF)
add_library(rabit src/allreduce_base.cc src/allreduce_robust.cc src/engine.cc src/c_api.cc)
add_library(rabit_base src/allreduce_base.cc src/engine_base.cc src/c_api.cc)
add_library(rabit_empty src/engine_empty.cc src/c_api.cc)
add_library(rabit_mock_static src/allreduce_base.cc src/allreduce_robust.cc src/engine_mock.cc src/c_api.cc)
add_library(rabit_mock SHARED src/allreduce_base.cc src/allreduce_robust.cc src/engine_mock.cc src/c_api.cc)
# moved from xgboost build
if(R_LIB OR MINGW OR WIN32)
add_library(rabit_base src/allreduce_base.cc src/engine_base.cc src/c_api.cc)
set(rabit_libs rabit)
set_target_properties(rabit PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
ELSE()
add_library(rabit src/allreduce_base.cc src/allreduce_robust.cc src/engine.cc src/c_api.cc)
add_library(rabit_base src/allreduce_base.cc src/engine_base.cc src/c_api.cc)
add_library(rabit_empty src/engine_empty.cc src/c_api.cc)
add_library(rabit_mock_static src/allreduce_base.cc src/allreduce_robust.cc src/engine_mock.cc src/c_api.cc)
add_library(rabit_mock SHARED src/allreduce_base.cc src/allreduce_robust.cc src/engine_mock.cc src/c_api.cc)
set(rabit_libs rabit rabit_base rabit_empty rabit_mock rabit_mock_static)
set_target_properties(rabit rabit_base rabit_empty rabit_mock rabit_mock_static PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
set(rabit_libs rabit rabit_base rabit_empty rabit_mock rabit_mock_static)
set_target_properties(rabit rabit_base rabit_empty rabit_mock rabit_mock_static PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
ENDIF(R_LIB OR MINGW OR WIN32)
if(RABIT_BUILD_MPI)
find_package(MPI REQUIRED)

View File

@ -22,6 +22,10 @@ All these features comes from the facts about small rabbit:)
- Programs persist over all the iterations, unless they fail and recover.
* Reliable: rabit dig burrows to avoid disasters
- Rabit programs can recover the model and results using synchronous function calls.
- Rabit programs can set rabit_boostrap_cache=1 to support allreduce/broadcast operations before loadcheckpoint
`
rabit::Init(); -> rabit::AllReduce(); -> rabit::loadCheckpoint(); -> for () { rabit::AllReduce(); rabit::Checkpoint();} -> rabit::Shutdown();
`
## Use Rabit
* Type make in the root folder will compile the rabit library in lib folder

View File

@ -9,39 +9,15 @@
#include <string>
#include "../serializable.h"
// keeps rabit api caller signature
#ifndef RABIT_API_CALLER_SIGNATURE
#define RABIT_API_CALLER_SIGNATURE
#ifdef __has_builtin
#if __has_builtin(__builtin_FILE)
#if (defined(__GNUC__) && !defined(__clang__))
#define _FILE __builtin_FILE()
#else
#define _FILE "N/A"
#endif // __has_builtin(__builtin_FILE)
#if __has_builtin(__builtin_LINE)
#define _LINE __builtin_LINE()
#else
#define _LINE -1
#endif // __has_builtin(__builtin_LINE)
#if __has_builtin(__builtin_FUNCTION)
#define _CALLER __builtin_FUNCTION()
#else
#define _CALLER "N/A"
#endif // __has_builtin(__builtin_FUNCTION)
#else
#define _FILE "N/A"
#define _LINE -1
#define _CALLER "N/A"
#endif // __has_builtin
#endif // RABIT_API_CALLER_SIGNATURE
#endif // (defined(__GNUC__) && !defined(__clang__))
namespace MPI {
/*! \brief MPI data type just to be compatible with MPI reduce function*/
@ -88,7 +64,6 @@ class IEngine {
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -99,7 +74,6 @@ class IEngine {
ReduceFunction reducer,
PreprocFunction prepare_fun = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER) = 0;
@ -108,13 +82,11 @@ class IEngine {
* \param sendrecvbuf_ buffer for both sending and receiving data
* \param size the size of the data to be broadcasted
* \param root the root worker id to broadcast the data
* \param is_bootstrap if this broadcast is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
*/
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER) = 0;
@ -254,7 +226,6 @@ enum DataType {
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function.
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -267,7 +238,6 @@ void Allreduce_(void *sendrecvbuf,
mpi::OpType op,
IEngine::PreprocFunction prepare_fun = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -296,7 +266,6 @@ class ReduceHandle {
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -306,7 +275,6 @@ class ReduceHandle {
size_t count,
IEngine::PreprocFunction prepare_fun = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);

View File

@ -94,10 +94,9 @@ struct BitOR {
};
template<typename OP, typename DType>
inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) {
const DType *src = (const DType*)src_;
DType *dst = (DType*)dst_; // NOLINT(*)
for (int i = 0; i < len; ++i) {
const DType* src = (const DType*)src_;
DType* dst = (DType*)dst_; // NOLINT(*)
for (int i = 0; i < len; i++) {
OP::Reduce(dst[i], src[i]);
}
}
@ -129,42 +128,39 @@ inline std::string GetProcessorName(void) {
}
// broadcast data to all other nodes from root
inline void Broadcast(void *sendrecv_data, size_t size, int root,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
engine::GetEngine()->Broadcast(sendrecv_data, size, root,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
template<typename DType>
inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
size_t size = sendrecv_data->size();
Broadcast(&size, sizeof(size), root, is_bootstrap, _file, _line, _caller);
Broadcast(&size, sizeof(size), root, _file, _line, _caller);
if (sendrecv_data->size() != size) {
sendrecv_data->resize(size);
}
if (size != 0) {
Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
}
inline void Broadcast(std::string *sendrecv_data, int root,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
size_t size = sendrecv_data->length();
Broadcast(&size, sizeof(size), root, is_bootstrap, _file, _line, _caller);
Broadcast(&size, sizeof(size), root, _file, _line, _caller);
if (sendrecv_data->length() != size) {
sendrecv_data->resize(size);
}
if (size != 0) {
Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
}
@ -173,13 +169,12 @@ template<typename OP, typename DType>
inline void Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *arg),
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
// C++11 support for lambda prepare function
@ -190,13 +185,12 @@ inline void InvokeLambda_(void *fun) {
template<typename OP, typename DType>
inline void Allreduce(DType *sendrecvbuf, size_t count,
std::function<void()> prepare_fun,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
engine::mpi::GetType<DType>(), OP::kType, InvokeLambda_, &prepare_fun,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
#endif // C++11
@ -244,11 +238,12 @@ inline void ReducerSafe_(const void *src_, void *dst_, int len_, const MPI::Data
const size_t kUnit = sizeof(DType);
const char *psrc = reinterpret_cast<const char*>(src_);
char *pdst = reinterpret_cast<char*>(dst_);
for (int i = 0; i < len_; ++i) {
DType tdst, tsrc;
// use memcpy to avoid alignment issue
std::memcpy(&tdst, pdst + i * kUnit, sizeof(tdst));
std::memcpy(&tsrc, psrc + i * kUnit, sizeof(tsrc));
std::memcpy(&tdst, pdst + (i * kUnit), sizeof(tdst));
std::memcpy(&tsrc, psrc + (i * kUnit), sizeof(tsrc));
freduce(tdst, tsrc);
std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));
}
@ -276,12 +271,11 @@ template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLIN
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *arg),
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
prepare_arg, is_bootstrap, _file, _line, _caller);
prepare_arg, _file, _line, _caller);
}
// function to perform reduction for SerializeReducer
template<typename DType>
@ -330,7 +324,6 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
size_t max_nbyte, size_t count,
void (*prepare_fun)(void *arg),
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -342,7 +335,7 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
// invoke here
handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
SerializeReduceClosure<DType>::Invoke, &c,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
for (size_t i = 0; i < count; ++i) {
utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte);
sendrecvobj[i].Load(fs);
@ -353,23 +346,21 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)g
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
std::function<void()> prepare_fun,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
this->Allreduce(sendrecvbuf, count, InvokeLambda_, &prepare_fun,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
template<typename DType>
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
size_t max_nbytes, size_t count,
std::function<void()> prepare_fun,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda_, &prepare_fun,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
#endif // DMLC_USE_CXX11
} // namespace rabit

View File

@ -26,33 +26,15 @@
#ifndef RABIT_API_CALLER_SIGNATURE
#define RABIT_API_CALLER_SIGNATURE
#ifdef __has_builtin
#if __has_builtin(__builtin_FILE)
#if (defined(__GNUC__) && !defined(__clang__))
#define _FILE __builtin_FILE()
#else
#define _FILE "N/A"
#endif // __has_builtin(__builtin_FILE)
#if __has_builtin(__builtin_LINE)
#define _LINE __builtin_LINE()
#else
#define _LINE -1
#endif // __has_builtin(__builtin_LINE)
#if __has_builtin(__builtin_FUNCTION)
#define _CALLER __builtin_FUNCTION()
#else
#define _CALLER "N/A"
#endif // __has_builtin(__builtin_FUNCTION)
#else
#define _FILE "N/A"
#define _LINE -1
#define _CALLER "N/A"
#endif // __has_builtin
#endif // (defined(__GNUC__) && !defined(__clang__))
#endif // RABIT_API_CALLER_SIGNATURE
@ -153,13 +135,11 @@ inline void TrackerPrintf(const char *fmt, ...);
* \param sendrecv_data the pointer to the send/receive buffer,
* \param size the data size
* \param root the process root
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
*/
inline void Broadcast(void *sendrecv_data, size_t size, int root,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -169,7 +149,6 @@ inline void Broadcast(void *sendrecv_data, size_t size, int root,
* \param sendrecv_data the pointer to send/receive vector,
* for the receiver, the vector does not need to be pre-allocated
* \param root the process root
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -178,7 +157,6 @@ inline void Broadcast(void *sendrecv_data, size_t size, int root,
*/
template<typename DType>
inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -186,14 +164,12 @@ inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
* \brief broadcasts a std::string to every node from the root
* \param sendrecv_data the pointer to the send/receive buffer,
* for the receiver, the vector does not need to be pre-allocated
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
* \param root the process root
*/
inline void Broadcast(std::string *sendrecv_data, int root,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -215,7 +191,6 @@ inline void Broadcast(std::string *sendrecv_data, int root,
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -226,7 +201,6 @@ template<typename OP, typename DType>
inline void Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *) = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -254,7 +228,6 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
* \param prepare_fun Lazy lambda preprocessing function, prepare_fun() will be invoked
* by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -264,7 +237,6 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
template<typename OP, typename DType>
inline void Allreduce(DType *sendrecvbuf, size_t count,
std::function<void()> prepare_fun,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -363,7 +335,6 @@ class Reducer {
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -371,7 +342,6 @@ class Reducer {
inline void Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *) = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -381,14 +351,12 @@ class Reducer {
* \param sendrecvbuf pointer to the array of objects to be reduced
* \param count number of elements to be reduced
* \param prepare_fun lambda function executed to prepare the data, if necessary
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
*/
inline void Allreduce(DType *sendrecvbuf, size_t count,
std::function<void()> prepare_fun,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -422,7 +390,6 @@ class SerializeReducer {
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
* If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -431,7 +398,6 @@ class SerializeReducer {
size_t max_nbyte, size_t count,
void (*prepare_fun)(void *) = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -444,7 +410,6 @@ class SerializeReducer {
* this includes budget limit for intermediate and final result
* \param count number of elements to be reduced
* \param prepare_fun lambda function executed to prepare the data, if necessary
* \param is_bootstrap if this allreduce is needed to bootstrap failed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -452,7 +417,6 @@ class SerializeReducer {
inline void Allreduce(DType *sendrecvobj,
size_t max_nbyte, size_t count,
std::function<void()> prepare_fun,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);

View File

@ -83,7 +83,6 @@ class AllreduceBase : public IEngine {
* will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to passed into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -94,7 +93,6 @@ class AllreduceBase : public IEngine {
ReduceFunction reducer,
PreprocFunction prepare_fun = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER) {
@ -109,14 +107,12 @@ class AllreduceBase : public IEngine {
* \param sendrecvbuf_ buffer for both sending and recving data
* \param size the size of the data to be broadcasted
* \param root the root worker id to broadcast the data
* \param is_bootstrap if this broadcast is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
*/
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
bool is_bootstrap = false, const char* _file = _FILE,
const int _line = _LINE, const char* _caller = _CALLER) {
const char* _file = _FILE, const int _line = _LINE, const char* _caller = _CALLER) {
if (world_size == 1 || world_size == -1) return;
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
"Broadcast failed");

View File

@ -30,6 +30,7 @@ AllreduceRobust::AllreduceRobust(void) {
global_lazycheck = NULL;
use_local_model = -1;
recover_counter = 0;
checkpoint_loaded = false;
env_vars.push_back("rabit_global_replica");
env_vars.push_back("rabit_local_replica");
}
@ -38,6 +39,7 @@ bool AllreduceRobust::Init(int argc, char* argv[]) {
// chenqin: alert user opted in experimental feature.
if (rabit_bootstrap_cache) utils::HandleLogInfo(
"[EXPERIMENTAL] rabit bootstrap cache has been enabled\n");
checkpoint_loaded = false;
if (num_global_replica == 0) {
result_buffer_round = -1;
} else {
@ -157,7 +159,6 @@ int AllreduceRobust::GetBootstrapCache(const std::string &key, void* buf,
* will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to passed into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -168,7 +169,6 @@ void AllreduceRobust::Allreduce(void *sendrecvbuf_,
ReduceFunction reducer,
PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -183,7 +183,7 @@ void AllreduceRobust::Allreduce(void *sendrecvbuf_,
+ std::string(_caller) + "#" +std::to_string(type_nbytes) + "x" + std::to_string(count);
// try fetch bootstrap allreduce results from cache
if (is_bootstrap && rabit_bootstrap_cache &&
if (!checkpoint_loaded && rabit_bootstrap_cache &&
GetBootstrapCache(key, sendrecvbuf_, type_nbytes, count, true) != -1) return;
double start = utils::GetTime();
@ -217,7 +217,7 @@ void AllreduceRobust::Allreduce(void *sendrecvbuf_,
}
// if bootstrap allreduce, store and fetch through cache
if (!is_bootstrap || !rabit_bootstrap_cache) {
if (checkpoint_loaded || !rabit_bootstrap_cache) {
resbuf.PushTemp(seq_counter, type_nbytes, count);
seq_counter += 1;
} else {
@ -229,13 +229,11 @@ void AllreduceRobust::Allreduce(void *sendrecvbuf_,
* \param sendrecvbuf_ buffer for both sending and recving data
* \param size the size of the data to be broadcasted
* \param root the root worker id to broadcast the data
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
*/
void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -245,7 +243,7 @@ void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root,
std::string key = std::string(_file) + "::" + std::to_string(_line) + "::"
+ std::string(_caller) + "#" +std::to_string(total_size) + "@" + std::to_string(root);
// try fetch bootstrap allreduce results from cache
if (is_bootstrap && rabit_bootstrap_cache &&
if (!checkpoint_loaded && rabit_bootstrap_cache &&
GetBootstrapCache(key, sendrecvbuf_, total_size, 1, true) != -1) return;
double start = utils::GetTime();
@ -277,7 +275,7 @@ void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root,
rank, key.c_str(), root, version_number, seq_counter, delta);
}
// if bootstrap broadcast, store and fetch through cache
if (!is_bootstrap || !rabit_bootstrap_cache) {
if (checkpoint_loaded || !rabit_bootstrap_cache) {
resbuf.PushTemp(seq_counter, 1, total_size);
seq_counter += 1;
} else {
@ -308,6 +306,7 @@ void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root,
*/
int AllreduceRobust::LoadCheckPoint(Serializable *global_model,
Serializable *local_model) {
checkpoint_loaded = true;
// skip action in single node
if (world_size == 1) return 0;
this->LocalModelCheck(local_model != NULL);

View File

@ -62,7 +62,6 @@ class AllreduceRobust : public AllreduceBase {
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to passed into the lazy preprocessing function
* \param prepare_arg argument used to passed into the lazy preprocessing function
* \param is_bootstrap if this allreduce is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
@ -73,7 +72,6 @@ class AllreduceRobust : public AllreduceBase {
ReduceFunction reducer,
PreprocFunction prepare_fun = NULL,
void *prepare_arg = NULL,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -82,13 +80,11 @@ class AllreduceRobust : public AllreduceBase {
* \param sendrecvbuf_ buffer for both sending and recving data
* \param size the size of the data to be broadcasted
* \param root the root worker id to broadcast the data
* \param is_bootstrap if this broadcast is needed to bootstrap filed node
* \param _file caller file name used to generate unique cache key
* \param _line caller line number used to generate unique cache key
* \param _caller caller function name used to generate unique cache key
*/
virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
bool is_bootstrap = false,
const char* _file = _FILE,
const int _line = _LINE,
const char* _caller = _CALLER);
@ -643,6 +639,8 @@ o * the input state must exactly one saved state(local state of current node)
std::string local_chkpt[2];
// version of local checkpoint can be 1 or 0
int local_chkpt_version;
// if checkpoint were loaded, used to distinguish results boostrap cache from seqno cache
bool checkpoint_loaded;
};
} // namespace engine
} // namespace rabit

View File

@ -93,12 +93,11 @@ void Allreduce_(void *sendrecvbuf,
mpi::OpType op,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun,
prepare_arg, is_bootstrap, _file, _line, _caller);
prepare_arg, _file, _line, _caller);
}
// code for reduce handle
@ -121,14 +120,13 @@ void ReduceHandle::Allreduce(void *sendrecvbuf,
size_t type_nbytes, size_t count,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
utils::Assert(redfunc_ != NULL, "must intialize handle to call AllReduce");
GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
redfunc_, prepare_fun, prepare_arg,
is_bootstrap, _file, _line, _caller);
_file, _line, _caller);
}
} // namespace engine
} // namespace rabit

View File

@ -31,7 +31,6 @@ class EmptyEngine : public IEngine {
ReduceFunction reducer,
PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -39,8 +38,7 @@ class EmptyEngine : public IEngine {
"use Allreduce_ instead");
}
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
bool is_bootstrap, const char* _file,
const int _line, const char* _caller) {
const char* _file, const int _line, const char* _caller) {
}
virtual void InitAfterException(void) {
utils::Error("EmptyEngine is not fault tolerant");
@ -109,7 +107,6 @@ void Allreduce_(void *sendrecvbuf,
mpi::OpType op,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -129,7 +126,6 @@ void ReduceHandle::Allreduce(void *sendrecvbuf,
size_t type_nbytes, size_t count,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {

View File

@ -33,7 +33,6 @@ class MPIEngine : public IEngine {
ReduceFunction reducer,
PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -41,7 +40,7 @@ class MPIEngine : public IEngine {
"use Allreduce_ instead");
}
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
bool is_bootstrap, const char* _file, const int _line,
const char* _file, const int _line,
const char* _caller) {
MPI::COMM_WORLD.Bcast(sendrecvbuf_, size, MPI::CHAR, root);
}
@ -160,7 +159,6 @@ void Allreduce_(void *sendrecvbuf,
mpi::OpType op,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {
@ -212,7 +210,6 @@ void ReduceHandle::Allreduce(void *sendrecvbuf,
size_t type_nbytes, size_t count,
IEngine::PreprocFunction prepare_fun,
void *prepare_arg,
bool is_bootstrap,
const char* _file,
const int _line,
const char* _caller) {

View File

@ -96,7 +96,7 @@ int main(int argc, char *argv[]) {
std::string name = rabit::GetProcessorName();
int max_rank = rank;
rabit::Allreduce<op::Max>(&max_rank, sizeof(int), NULL, NULL, true);
rabit::Allreduce<op::Max>(&max_rank, sizeof(int));
utils::Check(max_rank == nproc - 1, "max rank is world size-1");
Model model;