Cleanup RABIT. (#6290)
* Remove recovery and MPI speed tests. * Remove readme. * Remove Python binding. * Add checks in C API.
This commit is contained in:
parent
8e0f5a6fc7
commit
b180223d18
@ -964,7 +964,7 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_RabitVersionNumber
|
|||||||
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_RabitAllreduce
|
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_RabitAllreduce
|
||||||
(JNIEnv *jenv, jclass jcls, jobject jsendrecvbuf, jint jcount, jint jenum_dtype, jint jenum_op) {
|
(JNIEnv *jenv, jclass jcls, jobject jsendrecvbuf, jint jcount, jint jenum_dtype, jint jenum_op) {
|
||||||
void *ptr_sendrecvbuf = jenv->GetDirectBufferAddress(jsendrecvbuf);
|
void *ptr_sendrecvbuf = jenv->GetDirectBufferAddress(jsendrecvbuf);
|
||||||
RabitAllreduce(ptr_sendrecvbuf, (size_t) jcount, jenum_dtype, jenum_op, NULL, NULL);
|
JVM_CHECK_CALL(RabitAllreduce(ptr_sendrecvbuf, (size_t) jcount, jenum_dtype, jenum_op, NULL, NULL));
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -120,18 +120,18 @@ def broadcast(data, root):
|
|||||||
s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
|
s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
|
||||||
length.value = len(s)
|
length.value = len(s)
|
||||||
# run first broadcast
|
# run first broadcast
|
||||||
_LIB.RabitBroadcast(ctypes.byref(length),
|
_check_call(_LIB.RabitBroadcast(ctypes.byref(length),
|
||||||
ctypes.sizeof(ctypes.c_ulong), root)
|
ctypes.sizeof(ctypes.c_ulong), root))
|
||||||
if root != rank:
|
if root != rank:
|
||||||
dptr = (ctypes.c_char * length.value)()
|
dptr = (ctypes.c_char * length.value)()
|
||||||
# run second
|
# run second
|
||||||
_LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
|
_check_call(_LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
|
||||||
length.value, root)
|
length.value, root))
|
||||||
data = pickle.loads(dptr.raw)
|
data = pickle.loads(dptr.raw)
|
||||||
del dptr
|
del dptr
|
||||||
else:
|
else:
|
||||||
_LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
|
_check_call(_LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
|
||||||
length.value, root)
|
length.value, root))
|
||||||
del s
|
del s
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@ -189,18 +189,18 @@ def allreduce(data, op, prepare_fun=None):
|
|||||||
if buf.dtype not in DTYPE_ENUM__:
|
if buf.dtype not in DTYPE_ENUM__:
|
||||||
raise Exception('data type %s not supported' % str(buf.dtype))
|
raise Exception('data type %s not supported' % str(buf.dtype))
|
||||||
if prepare_fun is None:
|
if prepare_fun is None:
|
||||||
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
|
_check_call(_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
|
||||||
buf.size, DTYPE_ENUM__[buf.dtype],
|
buf.size, DTYPE_ENUM__[buf.dtype],
|
||||||
op, None, None)
|
op, None, None))
|
||||||
else:
|
else:
|
||||||
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
||||||
|
|
||||||
def pfunc(_):
|
def pfunc(_):
|
||||||
"""prepare function."""
|
"""prepare function."""
|
||||||
prepare_fun(data)
|
prepare_fun(data)
|
||||||
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
|
_check_call(_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
|
||||||
buf.size, DTYPE_ENUM__[buf.dtype],
|
buf.size, DTYPE_ENUM__[buf.dtype],
|
||||||
op, func_ptr(pfunc), None)
|
op, func_ptr(pfunc), None))
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,90 +0,0 @@
|
|||||||
sudo: true
|
|
||||||
|
|
||||||
os:
|
|
||||||
- linux
|
|
||||||
- osx
|
|
||||||
|
|
||||||
osx_image: xcode10.2
|
|
||||||
|
|
||||||
dist: xenial
|
|
||||||
|
|
||||||
language: cpp
|
|
||||||
|
|
||||||
# Use Build Matrix to do lint and build seperately
|
|
||||||
env:
|
|
||||||
matrix:
|
|
||||||
- TASK=lint LINT_LANG=cpp
|
|
||||||
- TASK=lint LINT_LANG=python
|
|
||||||
- TASK=doc
|
|
||||||
# - TASK=build
|
|
||||||
- TASK=mpi-build
|
|
||||||
- TASK=cmake-test
|
|
||||||
|
|
||||||
matrix:
|
|
||||||
exclude:
|
|
||||||
- os: osx
|
|
||||||
env: TASK=lint LINT_LANG=cpp
|
|
||||||
- os: osx
|
|
||||||
env: TASK=lint LINT_LANG=python
|
|
||||||
- os: osx
|
|
||||||
env: TASK=doc
|
|
||||||
- os: osx
|
|
||||||
env: TASK=build
|
|
||||||
|
|
||||||
# dependent apt packages
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
sources:
|
|
||||||
- llvm-toolchain-trusty-5.0
|
|
||||||
- ubuntu-toolchain-r-test
|
|
||||||
- george-edison55-precise-backports
|
|
||||||
packages:
|
|
||||||
- doxygen
|
|
||||||
- wget
|
|
||||||
- git
|
|
||||||
- libcurl4-openssl-dev
|
|
||||||
- unzip
|
|
||||||
- python-numpy
|
|
||||||
- gcc-4.8
|
|
||||||
- g++-4.8
|
|
||||||
- openssh-client
|
|
||||||
- openssh-server
|
|
||||||
- python3
|
|
||||||
- python3-setuptools
|
|
||||||
- python3-pip
|
|
||||||
- tree
|
|
||||||
homebrew:
|
|
||||||
packages:
|
|
||||||
- gcc49
|
|
||||||
- openssl
|
|
||||||
- libgit2
|
|
||||||
- python3
|
|
||||||
update: true
|
|
||||||
|
|
||||||
before_install:
|
|
||||||
- git clone https://github.com/dmlc/dmlc-core
|
|
||||||
- export TRAVIS=./scripts/
|
|
||||||
- source ${TRAVIS}/travis_setup_env.sh
|
|
||||||
- ${TRAVIS}/travis_osx_install.sh
|
|
||||||
- source ./scripts/travis_setup.sh
|
|
||||||
|
|
||||||
script: scripts/travis_script.sh
|
|
||||||
|
|
||||||
cache:
|
|
||||||
directories:
|
|
||||||
- ${HOME}/.cache/usr
|
|
||||||
- ${HOME}/.cache/pip
|
|
||||||
- mpich
|
|
||||||
|
|
||||||
before_cache:
|
|
||||||
- ${TRAVIS}/travis_before_cache.sh
|
|
||||||
|
|
||||||
after_success:
|
|
||||||
- tree build
|
|
||||||
- bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
|
|
||||||
|
|
||||||
notifications:
|
|
||||||
# Emails are sent to the committer's git-configured email address by default,
|
|
||||||
email:
|
|
||||||
on_success: change
|
|
||||||
on_failure: always
|
|
||||||
@ -1,40 +1 @@
|
|||||||
# Rabit: Reliable Allreduce and Broadcast Interface
|
# This directory contains the CPU network module for XGBoost. The library originates from [RABIT](https://github.com/dmlc/rabit)
|
||||||
[](https://travis-ci.org/dmlc/rabit)
|
|
||||||
[](http://rabit.readthedocs.org/)
|
|
||||||
|
|
||||||
rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs.
|
|
||||||
|
|
||||||
* [Tutorial](guide)
|
|
||||||
* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
|
|
||||||
* You can also directly read the [interface header](include/rabit.h)
|
|
||||||
* [XGBoost](https://github.com/dmlc/xgboost)
|
|
||||||
- Rabit is one of the backbone library to support distributed XGBoost
|
|
||||||
|
|
||||||
## Features
|
|
||||||
All these features comes from the facts about small rabbit:)
|
|
||||||
* Portable: rabit is light weight and runs everywhere
|
|
||||||
- Rabit is a library instead of a framework, a program only needs to link the library to run
|
|
||||||
- Rabit only replies on a mechanism to start program, which was provided by most framework
|
|
||||||
- You can run rabit programs on many platforms, including Yarn(Hadoop), MPI using the same code
|
|
||||||
* Scalable and Flexible: rabit runs fast
|
|
||||||
* Rabit program use Allreduce to communicate, and do not suffer the cost between iterations of MapReduce abstraction.
|
|
||||||
- Programs can call rabit functions in any order, as opposed to frameworks where callbacks are offered and called by the framework, i.e. inversion of control principle.
|
|
||||||
- Programs persist over all the iterations, unless they fail and recover.
|
|
||||||
* Reliable: rabit dig burrows to avoid disasters
|
|
||||||
- Rabit programs can recover the model and results using synchronous function calls.
|
|
||||||
- Rabit programs can set rabit_boostrap_cache=1 to support allreduce/broadcast operations before loadcheckpoint
|
|
||||||
`
|
|
||||||
rabit::Init(); -> rabit::AllReduce(); -> rabit::loadCheckpoint(); -> for () { rabit::AllReduce(); rabit::Checkpoint();} -> rabit::Shutdown();
|
|
||||||
`
|
|
||||||
|
|
||||||
## Use Rabit
|
|
||||||
* Type make in the root folder will compile the rabit library in lib folder
|
|
||||||
* Add lib to the library path and include to the include path of compiler
|
|
||||||
* Languages: You can use rabit in C++ and python
|
|
||||||
- It is also possible to port the library to other languages
|
|
||||||
|
|
||||||
## Contributing
|
|
||||||
Rabit is an open-source library, contributions are welcomed, including:
|
|
||||||
* The rabit core library.
|
|
||||||
* Customized tracker script for new platforms and interface of new languages.
|
|
||||||
* Tutorial and examples about the library.
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
@PACKAGE_INIT@
|
|
||||||
|
|
||||||
include("${CMAKE_CURRENT_LIST_DIR}/@TARGETS_EXPORT_NAME@.cmake")
|
|
||||||
check_required_components("@PROJECT_NAME@")
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
# code copied from https://crascit.com/2015/07/25/cmake-gtest/
|
|
||||||
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
|
|
||||||
|
|
||||||
project(googletest-download NONE)
|
|
||||||
|
|
||||||
include(ExternalProject)
|
|
||||||
|
|
||||||
ExternalProject_Add(
|
|
||||||
googletest
|
|
||||||
SOURCE_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-src"
|
|
||||||
BINARY_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-build"
|
|
||||||
GIT_REPOSITORY
|
|
||||||
https://github.com/google/googletest.git
|
|
||||||
GIT_TAG
|
|
||||||
release-1.8.0
|
|
||||||
CONFIGURE_COMMAND ""
|
|
||||||
BUILD_COMMAND ""
|
|
||||||
INSTALL_COMMAND ""
|
|
||||||
TEST_COMMAND ""
|
|
||||||
)
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
# the following code to fetch googletest
|
|
||||||
# is inspired by and adapted after https://crascit.com/2015/07/25/cmake-gtest/
|
|
||||||
# download and unpack googletest at configure time
|
|
||||||
|
|
||||||
macro(fetch_googletest _download_module_path _download_root)
|
|
||||||
set(GOOGLETEST_DOWNLOAD_ROOT ${_download_root})
|
|
||||||
configure_file(
|
|
||||||
${_download_module_path}/googletest-download.cmake
|
|
||||||
${_download_root}/CMakeLists.txt
|
|
||||||
@ONLY
|
|
||||||
)
|
|
||||||
unset(GOOGLETEST_DOWNLOAD_ROOT)
|
|
||||||
|
|
||||||
execute_process(
|
|
||||||
COMMAND
|
|
||||||
"${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
|
|
||||||
WORKING_DIRECTORY
|
|
||||||
${_download_root}
|
|
||||||
)
|
|
||||||
execute_process(
|
|
||||||
COMMAND
|
|
||||||
"${CMAKE_COMMAND}" --build .
|
|
||||||
WORKING_DIRECTORY
|
|
||||||
${_download_root}
|
|
||||||
)
|
|
||||||
|
|
||||||
# adds the targers: gtest, gtest_main, gmock, gmock_main
|
|
||||||
add_subdirectory(
|
|
||||||
${_download_root}/googletest-src
|
|
||||||
${_download_root}/googletest-build
|
|
||||||
)
|
|
||||||
endmacro()
|
|
||||||
@ -1,4 +1,3 @@
|
|||||||
numpy
|
numpy
|
||||||
breathe
|
breathe
|
||||||
commonmark
|
commonmark
|
||||||
|
|
||||||
|
|||||||
@ -41,7 +41,7 @@ RABIT_DLL bool RabitInit(int argc, char *argv[]);
|
|||||||
* call this function after you finished all jobs.
|
* call this function after you finished all jobs.
|
||||||
* \return true if rabit is initialized successfully otherwise false
|
* \return true if rabit is initialized successfully otherwise false
|
||||||
*/
|
*/
|
||||||
RABIT_DLL bool RabitFinalize(void);
|
RABIT_DLL int RabitFinalize(void);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief get rank of previous process in ring topology
|
* \brief get rank of previous process in ring topology
|
||||||
@ -91,8 +91,7 @@ RABIT_DLL void RabitGetProcessorName(char *out_name,
|
|||||||
* \param size the size of the data
|
* \param size the size of the data
|
||||||
* \param root the root of process
|
* \param root the root of process
|
||||||
*/
|
*/
|
||||||
RABIT_DLL void RabitBroadcast(void *sendrecv_data,
|
RABIT_DLL int RabitBroadcast(void *sendrecv_data, rbt_ulong size, int root);
|
||||||
rbt_ulong size, int root);
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
|
* \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
|
||||||
@ -110,12 +109,9 @@ RABIT_DLL void RabitBroadcast(void *sendrecv_data,
|
|||||||
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
|
* \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
|
||||||
* \sa ReturnType
|
* \sa ReturnType
|
||||||
*/
|
*/
|
||||||
RABIT_DLL void RabitAllgather(void *sendrecvbuf,
|
RABIT_DLL int RabitAllgather(void *sendrecvbuf, size_t total_size,
|
||||||
size_t total_size,
|
size_t beginIndex, size_t size_node_slice,
|
||||||
size_t beginIndex,
|
size_t size_prev_slice, int enum_dtype);
|
||||||
size_t size_node_slice,
|
|
||||||
size_t size_prev_slice,
|
|
||||||
int enum_dtype);
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief perform in-place allreduce, on sendrecvbuf
|
* \brief perform in-place allreduce, on sendrecvbuf
|
||||||
@ -133,14 +129,11 @@ RABIT_DLL void RabitAllgather(void *sendrecvbuf,
|
|||||||
* \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
|
* \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
|
||||||
* will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
|
* will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to passed into the lazy preprocessing function
|
* \param prepare_arg argument used to passed into the lazy preprocessing function
|
||||||
*/
|
*/
|
||||||
RABIT_DLL void RabitAllreduce(void *sendrecvbuf,
|
RABIT_DLL int RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
|
||||||
size_t count,
|
int enum_op, void (*prepare_fun)(void *arg),
|
||||||
int enum_dtype,
|
void *prepare_arg);
|
||||||
int enum_op,
|
|
||||||
void (*prepare_fun)(void *arg),
|
|
||||||
void *prepare_arg);
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief load latest check point
|
* \brief load latest check point
|
||||||
|
|||||||
@ -9,16 +9,6 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include "rabit/serializable.h"
|
#include "rabit/serializable.h"
|
||||||
|
|
||||||
#if (defined(__GNUC__) && !defined(__clang__))
|
|
||||||
#define _FILE __builtin_FILE()
|
|
||||||
#define _LINE __builtin_LINE()
|
|
||||||
#define _CALLER __builtin_FUNCTION()
|
|
||||||
#else
|
|
||||||
#define _FILE "N/A"
|
|
||||||
#define _LINE -1
|
|
||||||
#define _CALLER "N/A"
|
|
||||||
#endif // (defined(__GNUC__) && !defined(__clang__))
|
|
||||||
|
|
||||||
namespace MPI { // NOLINT
|
namespace MPI { // NOLINT
|
||||||
/*! \brief MPI data type just to be compatible with MPI reduce function*/
|
/*! \brief MPI data type just to be compatible with MPI reduce function*/
|
||||||
class Datatype;
|
class Datatype;
|
||||||
@ -65,18 +55,12 @@ class IEngine {
|
|||||||
* \param slice_begin beginning of the current slice
|
* \param slice_begin beginning of the current slice
|
||||||
* \param slice_end end of the current slice
|
* \param slice_end end of the current slice
|
||||||
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
virtual void Allgather(void *sendrecvbuf,
|
virtual void Allgather(void *sendrecvbuf,
|
||||||
size_t total_size,
|
size_t total_size,
|
||||||
size_t slice_begin,
|
size_t slice_begin,
|
||||||
size_t slice_end,
|
size_t slice_end,
|
||||||
size_t size_prev_slice,
|
size_t size_prev_slice) = 0;
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER) = 0;
|
|
||||||
/*!
|
/*!
|
||||||
* \brief performs in-place Allreduce, on sendrecvbuf
|
* \brief performs in-place Allreduce, on sendrecvbuf
|
||||||
* this function is NOT thread-safe
|
* this function is NOT thread-safe
|
||||||
@ -88,38 +72,20 @@ class IEngine {
|
|||||||
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
|
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
virtual void Allreduce(void *sendrecvbuf_,
|
virtual void Allreduce(void *sendrecvbuf_,
|
||||||
size_t type_nbytes,
|
size_t type_nbytes,
|
||||||
size_t count,
|
size_t count,
|
||||||
ReduceFunction reducer,
|
ReduceFunction reducer,
|
||||||
PreprocFunction prepare_fun = nullptr,
|
PreprocFunction prepare_fun = nullptr,
|
||||||
void *prepare_arg = nullptr,
|
void *prepare_arg = nullptr) = 0;
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER) = 0;
|
|
||||||
/*!
|
/*!
|
||||||
* \brief broadcasts data from root to every other node
|
* \brief broadcasts data from root to every other node
|
||||||
* \param sendrecvbuf_ buffer for both sending and receiving data
|
* \param sendrecvbuf_ buffer for both sending and receiving data
|
||||||
* \param size the size of the data to be broadcasted
|
* \param size the size of the data to be broadcasted
|
||||||
* \param root the root worker id to broadcast the data
|
* \param root the root worker id to broadcast the data
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
|
virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) = 0;
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER) = 0;
|
|
||||||
/*!
|
|
||||||
* \brief explicitly re-initialize everything before calling LoadCheckPoint
|
|
||||||
* call this function when IEngine throws an exception,
|
|
||||||
* this function should only be used for test purposes
|
|
||||||
*/
|
|
||||||
virtual void InitAfterException() = 0;
|
|
||||||
/*!
|
/*!
|
||||||
* \brief loads the latest check point
|
* \brief loads the latest check point
|
||||||
* \param global_model pointer to the globally shared model/state
|
* \param global_model pointer to the globally shared model/state
|
||||||
@ -250,18 +216,12 @@ enum DataType {
|
|||||||
* \param slice_begin beginning of the current slice
|
* \param slice_begin beginning of the current slice
|
||||||
* \param slice_end end of the current slice
|
* \param slice_end end of the current slice
|
||||||
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
void Allgather(void* sendrecvbuf,
|
void Allgather(void* sendrecvbuf,
|
||||||
size_t total_size,
|
size_t total_size,
|
||||||
size_t slice_begin,
|
size_t slice_begin,
|
||||||
size_t slice_end,
|
size_t slice_end,
|
||||||
size_t size_prev_slice,
|
size_t size_prev_slice);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
/*!
|
/*!
|
||||||
* \brief perform in-place Allreduce, on sendrecvbuf
|
* \brief perform in-place Allreduce, on sendrecvbuf
|
||||||
* this is an internal function used by rabit to be able to compile with MPI
|
* this is an internal function used by rabit to be able to compile with MPI
|
||||||
@ -276,9 +236,6 @@ void Allgather(void* sendrecvbuf,
|
|||||||
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
|
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to pass into the lazy preprocessing function.
|
* \param prepare_arg argument used to pass into the lazy preprocessing function.
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
void Allreduce_(void *sendrecvbuf, // NOLINT
|
void Allreduce_(void *sendrecvbuf, // NOLINT
|
||||||
size_t type_nbytes,
|
size_t type_nbytes,
|
||||||
@ -287,10 +244,7 @@ void Allreduce_(void *sendrecvbuf, // NOLINT
|
|||||||
mpi::DataType dtype,
|
mpi::DataType dtype,
|
||||||
mpi::OpType op,
|
mpi::OpType op,
|
||||||
IEngine::PreprocFunction prepare_fun = nullptr,
|
IEngine::PreprocFunction prepare_fun = nullptr,
|
||||||
void *prepare_arg = nullptr,
|
void *prepare_arg = nullptr);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
/*!
|
/*!
|
||||||
* \brief handle for customized reducer, used to handle customized reduce
|
* \brief handle for customized reducer, used to handle customized reduce
|
||||||
* this class is mainly created for compatiblity issues with MPI's customized reduce
|
* this class is mainly created for compatiblity issues with MPI's customized reduce
|
||||||
@ -316,18 +270,13 @@ class ReduceHandle {
|
|||||||
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
|
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
void Allreduce(void *sendrecvbuf,
|
void Allreduce(void *sendrecvbuf,
|
||||||
size_t type_nbytes,
|
size_t type_nbytes,
|
||||||
size_t count,
|
size_t count,
|
||||||
IEngine::PreprocFunction prepare_fun = nullptr,
|
IEngine::PreprocFunction prepare_fun = nullptr,
|
||||||
void *prepare_arg = nullptr,
|
void *prepare_arg = nullptr);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
/*! \return the number of bytes occupied by the type */
|
/*! \return the number of bytes occupied by the type */
|
||||||
static int TypeSize(const MPI::Datatype &dtype);
|
static int TypeSize(const MPI::Datatype &dtype);
|
||||||
|
|
||||||
|
|||||||
@ -131,40 +131,28 @@ inline std::string GetProcessorName() {
|
|||||||
return engine::GetEngine()->GetHost();
|
return engine::GetEngine()->GetHost();
|
||||||
}
|
}
|
||||||
// broadcast data to all other nodes from root
|
// broadcast data to all other nodes from root
|
||||||
inline void Broadcast(void *sendrecv_data, size_t size, int root,
|
inline void Broadcast(void *sendrecv_data, size_t size, int root) {
|
||||||
const char* _file,
|
engine::GetEngine()->Broadcast(sendrecv_data, size, root);
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
engine::GetEngine()->Broadcast(sendrecv_data, size, root,
|
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
|
inline void Broadcast(std::vector<DType> *sendrecv_data, int root) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
size_t size = sendrecv_data->size();
|
size_t size = sendrecv_data->size();
|
||||||
Broadcast(&size, sizeof(size), root, _file, _line, _caller);
|
Broadcast(&size, sizeof(size), root);
|
||||||
if (sendrecv_data->size() != size) {
|
if (sendrecv_data->size() != size) {
|
||||||
sendrecv_data->resize(size);
|
sendrecv_data->resize(size);
|
||||||
}
|
}
|
||||||
if (size != 0) {
|
if (size != 0) {
|
||||||
Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root,
|
Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root);
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
inline void Broadcast(std::string *sendrecv_data, int root,
|
inline void Broadcast(std::string *sendrecv_data, int root) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
size_t size = sendrecv_data->length();
|
size_t size = sendrecv_data->length();
|
||||||
Broadcast(&size, sizeof(size), root, _file, _line, _caller);
|
Broadcast(&size, sizeof(size), root);
|
||||||
if (sendrecv_data->length() != size) {
|
if (sendrecv_data->length() != size) {
|
||||||
sendrecv_data->resize(size);
|
sendrecv_data->resize(size);
|
||||||
}
|
}
|
||||||
if (size != 0) {
|
if (size != 0) {
|
||||||
Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root,
|
Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root);
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -172,13 +160,9 @@ inline void Broadcast(std::string *sendrecv_data, int root,
|
|||||||
template<typename OP, typename DType>
|
template<typename OP, typename DType>
|
||||||
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
void (*prepare_fun)(void *arg),
|
void (*prepare_fun)(void *arg),
|
||||||
void *prepare_arg,
|
void *prepare_arg) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
|
engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
|
||||||
engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg,
|
engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg);
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// C++11 support for lambda prepare function
|
// C++11 support for lambda prepare function
|
||||||
@ -188,13 +172,9 @@ inline void InvokeLambda(void *fun) {
|
|||||||
}
|
}
|
||||||
template<typename OP, typename DType>
|
template<typename OP, typename DType>
|
||||||
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
std::function<void()> prepare_fun,
|
std::function<void()> prepare_fun) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
|
engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
|
||||||
engine::mpi::GetType<DType>(), OP::kType, InvokeLambda, &prepare_fun,
|
engine::mpi::GetType<DType>(), OP::kType, InvokeLambda, &prepare_fun);
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Performs inplace Allgather
|
// Performs inplace Allgather
|
||||||
@ -203,13 +183,10 @@ inline void Allgather(DType *sendrecvbuf,
|
|||||||
size_t totalSize,
|
size_t totalSize,
|
||||||
size_t beginIndex,
|
size_t beginIndex,
|
||||||
size_t sizeNodeSlice,
|
size_t sizeNodeSlice,
|
||||||
size_t sizePrevSlice,
|
size_t sizePrevSlice) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
engine::GetEngine()->Allgather(sendrecvbuf, totalSize * sizeof(DType), beginIndex * sizeof(DType),
|
engine::GetEngine()->Allgather(sendrecvbuf, totalSize * sizeof(DType), beginIndex * sizeof(DType),
|
||||||
(beginIndex + sizeNodeSlice) * sizeof(DType),
|
(beginIndex + sizeNodeSlice) * sizeof(DType),
|
||||||
sizePrevSlice * sizeof(DType), _file, _line, _caller);
|
sizePrevSlice * sizeof(DType));
|
||||||
}
|
}
|
||||||
#endif // C++11
|
#endif // C++11
|
||||||
|
|
||||||
@ -289,12 +266,9 @@ inline Reducer<DType, freduce>::Reducer() {
|
|||||||
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
|
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
|
||||||
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
void (*prepare_fun)(void *arg),
|
void (*prepare_fun)(void *arg),
|
||||||
void *prepare_arg,
|
void *prepare_arg) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
|
handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
|
||||||
prepare_arg, _file, _line, _caller);
|
prepare_arg);
|
||||||
}
|
}
|
||||||
// function to perform reduction for SerializeReducer
|
// function to perform reduction for SerializeReducer
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
@ -342,10 +316,7 @@ template<typename DType>
|
|||||||
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
|
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
|
||||||
size_t max_nbyte, size_t count,
|
size_t max_nbyte, size_t count,
|
||||||
void (*prepare_fun)(void *arg),
|
void (*prepare_fun)(void *arg),
|
||||||
void *prepare_arg,
|
void *prepare_arg) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
buffer_.resize(max_nbyte * count);
|
buffer_.resize(max_nbyte * count);
|
||||||
// setup closure
|
// setup closure
|
||||||
SerializeReduceClosure<DType> c;
|
SerializeReduceClosure<DType> c;
|
||||||
@ -353,34 +324,23 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
|
|||||||
c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_;
|
c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_;
|
||||||
// invoke here
|
// invoke here
|
||||||
handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
|
handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
|
||||||
SerializeReduceClosure<DType>::Invoke, &c,
|
SerializeReduceClosure<DType>::Invoke, &c);
|
||||||
_file, _line, _caller);
|
|
||||||
for (size_t i = 0; i < count; ++i) {
|
for (size_t i = 0; i < count; ++i) {
|
||||||
utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte);
|
utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte);
|
||||||
sendrecvobj[i].Load(fs);
|
sendrecvobj[i].Load(fs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if DMLC_USE_CXX11
|
|
||||||
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)g
|
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)g
|
||||||
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
std::function<void()> prepare_fun,
|
std::function<void()> prepare_fun) {
|
||||||
const char* _file,
|
this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun);
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun,
|
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
|
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
|
||||||
size_t max_nbytes, size_t count,
|
size_t max_nbytes, size_t count,
|
||||||
std::function<void()> prepare_fun,
|
std::function<void()> prepare_fun) {
|
||||||
const char* _file,
|
this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun);
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun,
|
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
#endif // DMLC_USE_CXX11
|
|
||||||
} // namespace rabit
|
} // namespace rabit
|
||||||
#endif // RABIT_INTERNAL_RABIT_INL_H_
|
#endif // RABIT_INTERNAL_RABIT_INL_H_
|
||||||
|
|||||||
@ -1,41 +0,0 @@
|
|||||||
/*!
|
|
||||||
* Copyright (c) 2014-2019 by Contributors
|
|
||||||
* \file timer.h
|
|
||||||
* \brief This file defines the utils for timing
|
|
||||||
* \author Tianqi Chen, Nacho, Tianyi
|
|
||||||
*/
|
|
||||||
#ifndef RABIT_INTERNAL_TIMER_H_
|
|
||||||
#define RABIT_INTERNAL_TIMER_H_
|
|
||||||
#include <ctime>
|
|
||||||
#ifdef __MACH__
|
|
||||||
#include <mach/clock.h>
|
|
||||||
#include <mach/mach.h>
|
|
||||||
#endif // __MACH__
|
|
||||||
#include "./utils.h"
|
|
||||||
|
|
||||||
namespace rabit {
|
|
||||||
namespace utils {
|
|
||||||
/*!
|
|
||||||
* \brief return time in seconds, not cross platform, avoid to use this in most places
|
|
||||||
*/
|
|
||||||
inline double GetTime() {
|
|
||||||
#ifdef __MACH__
|
|
||||||
clock_serv_t cclock;
|
|
||||||
mach_timespec_t mts;
|
|
||||||
host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
|
|
||||||
utils::Check(clock_get_time(cclock, &mts) == 0, "failed to get time");
|
|
||||||
mach_port_deallocate(mach_task_self(), cclock);
|
|
||||||
return static_cast<double>(mts.tv_sec) + static_cast<double>(mts.tv_nsec) * 1e-9;
|
|
||||||
#else
|
|
||||||
#if defined(__unix__) || defined(__linux__)
|
|
||||||
timespec ts;
|
|
||||||
utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
|
|
||||||
return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
|
|
||||||
#else
|
|
||||||
return static_cast<double>(time(NULL));
|
|
||||||
#endif // defined(__unix__) || defined(__linux__)
|
|
||||||
#endif // __MACH__
|
|
||||||
}
|
|
||||||
} // namespace utils
|
|
||||||
} // namespace rabit
|
|
||||||
#endif // RABIT_INTERNAL_TIMER_H_
|
|
||||||
@ -69,35 +69,10 @@ inline bool StringToBool(const char* s) {
|
|||||||
return CompareStringsCaseInsensitive(s, "true") == 0 || atoi(s) != 0;
|
return CompareStringsCaseInsensitive(s, "true") == 0 || atoi(s) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
|
||||||
* \brief handling of Assert error, caused by inappropriate input
|
|
||||||
* \param msg error message
|
|
||||||
*/
|
|
||||||
inline void HandleAssertError(const char *msg) {
|
|
||||||
LOG(FATAL) << msg;
|
|
||||||
}
|
|
||||||
/*!
|
|
||||||
* \brief handling of Check error, caused by inappropriate input
|
|
||||||
* \param msg error message
|
|
||||||
*/
|
|
||||||
inline void HandleCheckError(const char *msg) {
|
|
||||||
LOG(FATAL) << msg;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void HandlePrint(const char *msg) {
|
inline void HandlePrint(const char *msg) {
|
||||||
printf("%s", msg);
|
printf("%s", msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void HandleLogInfo(const char *fmt, ...) {
|
|
||||||
std::string msg(kPrintBuffer, '\0');
|
|
||||||
va_list args;
|
|
||||||
va_start(args, fmt);
|
|
||||||
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
|
||||||
va_end(args);
|
|
||||||
fprintf(stdout, "%s", msg.c_str());
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! \brief printf, prints messages to the console */
|
/*! \brief printf, prints messages to the console */
|
||||||
inline void Printf(const char *fmt, ...) {
|
inline void Printf(const char *fmt, ...) {
|
||||||
std::string msg(kPrintBuffer, '\0');
|
std::string msg(kPrintBuffer, '\0');
|
||||||
@ -108,15 +83,6 @@ inline void Printf(const char *fmt, ...) {
|
|||||||
HandlePrint(msg.c_str());
|
HandlePrint(msg.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
/*! \brief portable version of snprintf */
|
|
||||||
inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
|
|
||||||
va_list args;
|
|
||||||
va_start(args, fmt);
|
|
||||||
int ret = vsnprintf(buf, size, fmt, args);
|
|
||||||
va_end(args);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*! \brief assert a condition is true, use this to handle debug information */
|
/*! \brief assert a condition is true, use this to handle debug information */
|
||||||
inline void Assert(bool exp, const char *fmt, ...) {
|
inline void Assert(bool exp, const char *fmt, ...) {
|
||||||
if (!exp) {
|
if (!exp) {
|
||||||
@ -125,7 +91,7 @@ inline void Assert(bool exp, const char *fmt, ...) {
|
|||||||
va_start(args, fmt);
|
va_start(args, fmt);
|
||||||
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
||||||
va_end(args);
|
va_end(args);
|
||||||
HandleAssertError(msg.c_str());
|
LOG(FATAL) << msg;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,7 +103,7 @@ inline void Check(bool exp, const char *fmt, ...) {
|
|||||||
va_start(args, fmt);
|
va_start(args, fmt);
|
||||||
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
||||||
va_end(args);
|
va_end(args);
|
||||||
HandleCheckError(msg.c_str());
|
LOG(FATAL) << msg;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -149,7 +115,7 @@ inline void Error(const char *fmt, ...) {
|
|||||||
va_start(args, fmt);
|
va_start(args, fmt);
|
||||||
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
vsnprintf(&msg[0], kPrintBuffer, fmt, args);
|
||||||
va_end(args);
|
va_end(args);
|
||||||
HandleCheckError(msg.c_str());
|
LOG(FATAL) << msg;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // namespace utils
|
} // namespace utils
|
||||||
@ -176,15 +142,6 @@ inline T *BeginPtr(std::vector<T> &vec) { // NOLINT(*)
|
|||||||
return &vec[0];
|
return &vec[0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*! \brief get the beginning address of a vector */
|
|
||||||
template<typename T>
|
|
||||||
inline const T *BeginPtr(const std::vector<T> &vec) { // NOLINT(*)
|
|
||||||
if (vec.size() == 0) {
|
|
||||||
return nullptr;
|
|
||||||
} else {
|
|
||||||
return &vec[0];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
inline char* BeginPtr(std::string &str) { // NOLINT(*)
|
inline char* BeginPtr(std::string &str) { // NOLINT(*)
|
||||||
if (str.length() == 0) return nullptr;
|
if (str.length() == 0) return nullptr;
|
||||||
return &str[0];
|
return &str[0];
|
||||||
|
|||||||
@ -12,36 +12,7 @@
|
|||||||
#define RABIT_RABIT_H_ // NOLINT(*)
|
#define RABIT_RABIT_H_ // NOLINT(*)
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// whether or not use c++11 support
|
|
||||||
#ifndef DMLC_USE_CXX11
|
|
||||||
#if defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
|
|
||||||
#define DMLC_USE_CXX11 1
|
|
||||||
#else
|
|
||||||
#define DMLC_USE_CXX11 (__cplusplus >= 201103L)
|
|
||||||
#endif // defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
|
|
||||||
#endif // DMLC_USE_CXX11
|
|
||||||
|
|
||||||
// keeps rabit api caller signature
|
|
||||||
#ifndef RABIT_API_CALLER_SIGNATURE
|
|
||||||
#define RABIT_API_CALLER_SIGNATURE
|
|
||||||
|
|
||||||
#if (defined(__GNUC__) && !defined(__clang__))
|
|
||||||
#define _FILE __builtin_FILE()
|
|
||||||
#define _LINE __builtin_LINE()
|
|
||||||
#define _CALLER __builtin_FUNCTION()
|
|
||||||
#else
|
|
||||||
#define _FILE "N/A"
|
|
||||||
#define _LINE -1
|
|
||||||
#define _CALLER "N/A"
|
|
||||||
#endif // (defined(__GNUC__) && !defined(__clang__))
|
|
||||||
|
|
||||||
#endif // RABIT_API_CALLER_SIGNATURE
|
|
||||||
|
|
||||||
// optionally support of lambda functions in C++11, if available
|
|
||||||
#if DMLC_USE_CXX11
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#endif // C++11
|
|
||||||
// engine definition of rabit, defines internal implementation
|
// engine definition of rabit, defines internal implementation
|
||||||
// to use rabit interface, there is no need to read engine.h
|
// to use rabit interface, there is no need to read engine.h
|
||||||
// rabit.h and serializable.h are enough to use the interface
|
// rabit.h and serializable.h are enough to use the interface
|
||||||
@ -135,31 +106,19 @@ inline void TrackerPrintf(const char *fmt, ...);
|
|||||||
* \param sendrecv_data the pointer to the send/receive buffer,
|
* \param sendrecv_data the pointer to the send/receive buffer,
|
||||||
* \param size the data size
|
* \param size the data size
|
||||||
* \param root the process root
|
* \param root the process root
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
inline void Broadcast(void *sendrecv_data, size_t size, int root,
|
inline void Broadcast(void *sendrecv_data, size_t size, int root);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief broadcasts an std::vector<DType> to every node from root
|
* \brief broadcasts an std::vector<DType> to every node from root
|
||||||
* \param sendrecv_data the pointer to send/receive vector,
|
* \param sendrecv_data the pointer to send/receive vector,
|
||||||
* for the receiver, the vector does not need to be pre-allocated
|
* for the receiver, the vector does not need to be pre-allocated
|
||||||
* \param root the process root
|
* \param root the process root
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
* \tparam DType the data type stored in the vector, has to be a simple data type
|
* \tparam DType the data type stored in the vector, has to be a simple data type
|
||||||
* that can be directly transmitted by sending the sizeof(DType)
|
* that can be directly transmitted by sending the sizeof(DType)
|
||||||
*/
|
*/
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
|
inline void Broadcast(std::vector<DType> *sendrecv_data, int root);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
/*!
|
/*!
|
||||||
* \brief broadcasts a std::string to every node from the root
|
* \brief broadcasts a std::string to every node from the root
|
||||||
* \param sendrecv_data the pointer to the send/receive buffer,
|
* \param sendrecv_data the pointer to the send/receive buffer,
|
||||||
@ -169,10 +128,7 @@ inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
|
|||||||
* \param _caller caller function name used to generate unique cache key
|
* \param _caller caller function name used to generate unique cache key
|
||||||
* \param root the process root
|
* \param root the process root
|
||||||
*/
|
*/
|
||||||
inline void Broadcast(std::string *sendrecv_data, int root,
|
inline void Broadcast(std::string *sendrecv_data, int root);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
/*!
|
/*!
|
||||||
* \brief performs in-place Allreduce on sendrecvbuf
|
* \brief performs in-place Allreduce on sendrecvbuf
|
||||||
* this function is NOT thread-safe
|
* this function is NOT thread-safe
|
||||||
@ -191,19 +147,13 @@ inline void Broadcast(std::string *sendrecv_data, int root,
|
|||||||
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
|
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
* \tparam OP see namespace op, reduce operator
|
* \tparam OP see namespace op, reduce operator
|
||||||
* \tparam DType data type
|
* \tparam DType data type
|
||||||
*/
|
*/
|
||||||
template<typename OP, typename DType>
|
template<typename OP, typename DType>
|
||||||
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
void (*prepare_fun)(void *) = nullptr,
|
void (*prepare_fun)(void *) = nullptr,
|
||||||
void *prepare_arg = nullptr,
|
void *prepare_arg = nullptr);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
|
* \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
|
||||||
@ -217,19 +167,13 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
|
|||||||
* \param slice_begin beginning of the current slice
|
* \param slice_begin beginning of the current slice
|
||||||
* \param slice_end end of the current slice
|
* \param slice_end end of the current slice
|
||||||
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
inline void Allgather(DType *sendrecvbuf_,
|
inline void Allgather(DType *sendrecvbuf_,
|
||||||
size_t total_size,
|
size_t total_size,
|
||||||
size_t slice_begin,
|
size_t slice_begin,
|
||||||
size_t slice_end,
|
size_t slice_end,
|
||||||
size_t size_prev_slice,
|
size_t size_prev_slice);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
|
|
||||||
// C++11 support for lambda prepare function
|
// C++11 support for lambda prepare function
|
||||||
#if DMLC_USE_CXX11
|
#if DMLC_USE_CXX11
|
||||||
@ -254,18 +198,12 @@ inline void Allgather(DType *sendrecvbuf_,
|
|||||||
* \param prepare_fun Lazy lambda preprocessing function, prepare_fun() will be invoked
|
* \param prepare_fun Lazy lambda preprocessing function, prepare_fun() will be invoked
|
||||||
* by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
|
* by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
* \tparam OP see namespace op, reduce operator
|
* \tparam OP see namespace op, reduce operator
|
||||||
* \tparam DType data type
|
* \tparam DType data type
|
||||||
*/
|
*/
|
||||||
template<typename OP, typename DType>
|
template<typename OP, typename DType>
|
||||||
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
std::function<void()> prepare_fun,
|
std::function<void()> prepare_fun);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
#endif // C++11
|
#endif // C++11
|
||||||
/*!
|
/*!
|
||||||
* \brief loads the latest check point
|
* \brief loads the latest check point
|
||||||
@ -361,31 +299,19 @@ class Reducer {
|
|||||||
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
|
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
void (*prepare_fun)(void *) = nullptr,
|
void (*prepare_fun)(void *) = nullptr,
|
||||||
void *prepare_arg = nullptr,
|
void *prepare_arg = nullptr);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
#if DMLC_USE_CXX11
|
#if DMLC_USE_CXX11
|
||||||
/*!
|
/*!
|
||||||
* \brief customized in-place all reduce operation, with lambda function as preprocessor
|
* \brief customized in-place all reduce operation, with lambda function as preprocessor
|
||||||
* \param sendrecvbuf pointer to the array of objects to be reduced
|
* \param sendrecvbuf pointer to the array of objects to be reduced
|
||||||
* \param count number of elements to be reduced
|
* \param count number of elements to be reduced
|
||||||
* \param prepare_fun lambda function executed to prepare the data, if necessary
|
* \param prepare_fun lambda function executed to prepare the data, if necessary
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
inline void Allreduce(DType *sendrecvbuf, size_t count,
|
||||||
std::function<void()> prepare_fun,
|
std::function<void()> prepare_fun);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
#endif // DMLC_USE_CXX11
|
#endif // DMLC_USE_CXX11
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -416,17 +342,11 @@ class SerializeReducer {
|
|||||||
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
|
* will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
|
||||||
* If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
* \param prepare_arg argument used to pass into the lazy preprocessing function
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
inline void Allreduce(DType *sendrecvobj,
|
inline void Allreduce(DType *sendrecvobj,
|
||||||
size_t max_nbyte, size_t count,
|
size_t max_nbyte, size_t count,
|
||||||
void (*prepare_fun)(void *) = nullptr,
|
void (*prepare_fun)(void *) = nullptr,
|
||||||
void *prepare_arg = nullptr,
|
void *prepare_arg = nullptr);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
// C++11 support for lambda prepare function
|
// C++11 support for lambda prepare function
|
||||||
#if DMLC_USE_CXX11
|
#if DMLC_USE_CXX11
|
||||||
/*!
|
/*!
|
||||||
@ -436,16 +356,10 @@ class SerializeReducer {
|
|||||||
* this includes budget limit for intermediate and final result
|
* this includes budget limit for intermediate and final result
|
||||||
* \param count number of elements to be reduced
|
* \param count number of elements to be reduced
|
||||||
* \param prepare_fun lambda function executed to prepare the data, if necessary
|
* \param prepare_fun lambda function executed to prepare the data, if necessary
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
inline void Allreduce(DType *sendrecvobj,
|
inline void Allreduce(DType *sendrecvobj,
|
||||||
size_t max_nbyte, size_t count,
|
size_t max_nbyte, size_t count,
|
||||||
std::function<void()> prepare_fun,
|
std::function<void()> prepare_fun);
|
||||||
const char* _file = _FILE,
|
|
||||||
const int _line = _LINE,
|
|
||||||
const char* _caller = _CALLER);
|
|
||||||
#endif // DMLC_USE_CXX11
|
#endif // DMLC_USE_CXX11
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -1,15 +0,0 @@
|
|||||||
Rabit Library
|
|
||||||
=====
|
|
||||||
This folder holds the library file generated by the compiler. To generate the library file, type ```make``` in the project root folder. If you want mpi compatible library, type ```make mpi```
|
|
||||||
|
|
||||||
***List of Files***
|
|
||||||
* rabit.a The rabit package library
|
|
||||||
- Normally you need to link with this one
|
|
||||||
* rabit_mock.a The rabit package library with mock test
|
|
||||||
- This library allows additional mock-test
|
|
||||||
* rabit_mpi.a The MPI backed library
|
|
||||||
- Link against this library makes the program use MPI Allreduce
|
|
||||||
- This library is not fault-tolerant
|
|
||||||
* rabit_empty.a Dummy package implementation
|
|
||||||
- This is an empty library that does not provide anything
|
|
||||||
- Only introduced to minimize code dependency for projects that only need single machine code
|
|
||||||
@ -1,364 +0,0 @@
|
|||||||
"""
|
|
||||||
Reliable Allreduce and Broadcast Library.
|
|
||||||
|
|
||||||
Author: Tianqi Chen
|
|
||||||
"""
|
|
||||||
# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
|
|
||||||
import pickle
|
|
||||||
import ctypes
|
|
||||||
import os
|
|
||||||
import platform
|
|
||||||
import sys
|
|
||||||
import warnings
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
# version information about the doc
|
|
||||||
__version__ = '1.0'
|
|
||||||
|
|
||||||
_LIB = None
|
|
||||||
|
|
||||||
def _find_lib_path(dll_name):
|
|
||||||
"""Find the rabit dynamic library files.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
lib_path: list(string)
|
|
||||||
List of all found library path to rabit
|
|
||||||
"""
|
|
||||||
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
|
|
||||||
# make pythonpack hack: copy this directory one level upper for setup.py
|
|
||||||
dll_path = [curr_path,
|
|
||||||
os.path.join(curr_path, '../lib/'),
|
|
||||||
os.path.join(curr_path, './lib/')]
|
|
||||||
if os.name == 'nt':
|
|
||||||
dll_path = [os.path.join(p, dll_name) for p in dll_path]
|
|
||||||
else:
|
|
||||||
dll_path = [os.path.join(p, dll_name) for p in dll_path]
|
|
||||||
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
|
|
||||||
#From github issues, most of installation errors come from machines w/o compilers
|
|
||||||
if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
|
|
||||||
raise RuntimeError(
|
|
||||||
'Cannot find Rabit Libarary in the candicate path, ' +
|
|
||||||
'did you install compilers and run build.sh in root path?\n'
|
|
||||||
'List of candidates:\n' + ('\n'.join(dll_path)))
|
|
||||||
return lib_path
|
|
||||||
|
|
||||||
# load in xgboost library
|
|
||||||
def _loadlib(lib='standard', lib_dll=None):
|
|
||||||
"""Load rabit library."""
|
|
||||||
global _LIB
|
|
||||||
if _LIB is not None:
|
|
||||||
warnings.warn('rabit.int call was ignored because it has'\
|
|
||||||
' already been initialized', level=2)
|
|
||||||
return
|
|
||||||
|
|
||||||
if lib_dll is not None:
|
|
||||||
_LIB = lib_dll
|
|
||||||
return
|
|
||||||
|
|
||||||
if lib == 'standard':
|
|
||||||
dll_name = 'librabit'
|
|
||||||
else:
|
|
||||||
dll_name = 'librabit_' + lib
|
|
||||||
|
|
||||||
if os.name == 'nt':
|
|
||||||
dll_name += '.dll'
|
|
||||||
elif platform.system() == 'Darwin':
|
|
||||||
dll_name += '.dylib'
|
|
||||||
else:
|
|
||||||
dll_name += '.so'
|
|
||||||
|
|
||||||
_LIB = ctypes.cdll.LoadLibrary(_find_lib_path(dll_name)[0])
|
|
||||||
_LIB.RabitGetRank.restype = ctypes.c_int
|
|
||||||
_LIB.RabitGetWorldSize.restype = ctypes.c_int
|
|
||||||
_LIB.RabitVersionNumber.restype = ctypes.c_int
|
|
||||||
|
|
||||||
def _unloadlib():
|
|
||||||
"""Unload rabit library."""
|
|
||||||
global _LIB
|
|
||||||
del _LIB
|
|
||||||
_LIB = None
|
|
||||||
|
|
||||||
# reduction operators
|
|
||||||
MAX = 0
|
|
||||||
MIN = 1
|
|
||||||
SUM = 2
|
|
||||||
BITOR = 3
|
|
||||||
|
|
||||||
def init(args=None, lib='standard', lib_dll=None):
|
|
||||||
"""Intialize the rabit module, call this once before using anything.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
args: list of str, optional
|
|
||||||
The list of arguments used to initialized the rabit
|
|
||||||
usually you need to pass in sys.argv.
|
|
||||||
Defaults to sys.argv when it is None.
|
|
||||||
lib: {'standard', 'mock', 'mpi'}, optional
|
|
||||||
Type of library we want to load
|
|
||||||
When cdll is specified
|
|
||||||
lib_dll: ctypes.DLL, optional
|
|
||||||
The DLL object used as lib.
|
|
||||||
When this is presented argument lib will be ignored.
|
|
||||||
"""
|
|
||||||
if args is None:
|
|
||||||
args = []
|
|
||||||
_loadlib(lib, lib_dll)
|
|
||||||
arr = (ctypes.c_char_p * len(args))()
|
|
||||||
|
|
||||||
arr[:] = args
|
|
||||||
_LIB.RabitInit(len(args), arr)
|
|
||||||
|
|
||||||
def finalize():
|
|
||||||
"""Finalize the rabit engine.
|
|
||||||
|
|
||||||
Call this function after you finished all jobs.
|
|
||||||
"""
|
|
||||||
_LIB.RabitFinalize()
|
|
||||||
_unloadlib()
|
|
||||||
|
|
||||||
def get_rank():
|
|
||||||
"""Get rank of current process.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
rank : int
|
|
||||||
Rank of current process.
|
|
||||||
"""
|
|
||||||
ret = _LIB.RabitGetRank()
|
|
||||||
return ret
|
|
||||||
|
|
||||||
def get_world_size():
|
|
||||||
"""Get total number workers.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
n : int
|
|
||||||
Total number of process.
|
|
||||||
"""
|
|
||||||
ret = _LIB.RabitGetWorldSize()
|
|
||||||
return ret
|
|
||||||
|
|
||||||
def tracker_print(msg):
|
|
||||||
"""Print message to the tracker.
|
|
||||||
|
|
||||||
This function can be used to communicate the information of
|
|
||||||
the progress to the tracker
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
msg : str
|
|
||||||
The message to be printed to tracker.
|
|
||||||
"""
|
|
||||||
if not isinstance(msg, str):
|
|
||||||
msg = str(msg)
|
|
||||||
_LIB.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8'))
|
|
||||||
|
|
||||||
def get_processor_name():
|
|
||||||
"""Get the processor name.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
name : str
|
|
||||||
the name of processor(host)
|
|
||||||
"""
|
|
||||||
mxlen = 256
|
|
||||||
length = ctypes.c_ulong()
|
|
||||||
buf = ctypes.create_string_buffer(mxlen)
|
|
||||||
_LIB.RabitGetProcessorName(buf, ctypes.byref(length), mxlen)
|
|
||||||
return buf.value
|
|
||||||
|
|
||||||
def broadcast(data, root):
|
|
||||||
"""Broadcast object from one node to all other nodes.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
data : any type that can be pickled
|
|
||||||
Input data, if current rank does not equal root, this can be None
|
|
||||||
root : int
|
|
||||||
Rank of the node to broadcast data from.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
object : int
|
|
||||||
the result of broadcast.
|
|
||||||
"""
|
|
||||||
rank = get_rank()
|
|
||||||
length = ctypes.c_ulong()
|
|
||||||
if root == rank:
|
|
||||||
assert data is not None, 'need to pass in data when broadcasting'
|
|
||||||
s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
|
|
||||||
length.value = len(s)
|
|
||||||
# run first broadcast
|
|
||||||
_LIB.RabitBroadcast(ctypes.byref(length),
|
|
||||||
ctypes.sizeof(ctypes.c_ulong), root)
|
|
||||||
if root != rank:
|
|
||||||
dptr = (ctypes.c_char * length.value)()
|
|
||||||
# run second
|
|
||||||
_LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
|
|
||||||
length.value, root)
|
|
||||||
data = pickle.loads(dptr.raw)
|
|
||||||
del dptr
|
|
||||||
else:
|
|
||||||
_LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
|
|
||||||
length.value, root)
|
|
||||||
del s
|
|
||||||
return data
|
|
||||||
|
|
||||||
# enumeration of dtypes
|
|
||||||
DTYPE_ENUM__ = {
|
|
||||||
np.dtype('int8') : 0,
|
|
||||||
np.dtype('uint8') : 1,
|
|
||||||
np.dtype('int32') : 2,
|
|
||||||
np.dtype('uint32') : 3,
|
|
||||||
np.dtype('int64') : 4,
|
|
||||||
np.dtype('uint64') : 5,
|
|
||||||
np.dtype('float32') : 6,
|
|
||||||
np.dtype('float64') : 7
|
|
||||||
}
|
|
||||||
|
|
||||||
def allreduce(data, op, prepare_fun=None):
|
|
||||||
"""Perform allreduce, return the result.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
data: numpy array
|
|
||||||
Input data.
|
|
||||||
op: int
|
|
||||||
Reduction operators, can be MIN, MAX, SUM, BITOR
|
|
||||||
prepare_fun: function
|
|
||||||
Lazy preprocessing function, if it is not None, prepare_fun(data)
|
|
||||||
will be called by the function before performing allreduce, to intialize the data
|
|
||||||
If the result of Allreduce can be recovered directly,
|
|
||||||
then prepare_fun will NOT be called
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
result : array_like
|
|
||||||
The result of allreduce, have same shape as data
|
|
||||||
|
|
||||||
Notes
|
|
||||||
-----
|
|
||||||
This function is not thread-safe.
|
|
||||||
"""
|
|
||||||
if not isinstance(data, np.ndarray):
|
|
||||||
raise Exception('allreduce only takes in numpy.ndarray')
|
|
||||||
buf = data.ravel()
|
|
||||||
if buf.base is data.base:
|
|
||||||
buf = buf.copy()
|
|
||||||
if buf.dtype not in DTYPE_ENUM__:
|
|
||||||
raise Exception('data type %s not supported' % str(buf.dtype))
|
|
||||||
if prepare_fun is None:
|
|
||||||
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
|
|
||||||
buf.size, DTYPE_ENUM__[buf.dtype],
|
|
||||||
op, None, None)
|
|
||||||
else:
|
|
||||||
func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
|
|
||||||
def pfunc(args):
|
|
||||||
"""prepare function."""
|
|
||||||
prepare_fun(data)
|
|
||||||
_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
|
|
||||||
buf.size, DTYPE_ENUM__[buf.dtype],
|
|
||||||
op, func_ptr(pfunc), None)
|
|
||||||
return buf
|
|
||||||
|
|
||||||
|
|
||||||
def _load_model(ptr, length):
|
|
||||||
"""
|
|
||||||
Internal function used by the module,
|
|
||||||
unpickle a model from a buffer specified by ptr, length
|
|
||||||
Arguments:
|
|
||||||
ptr: ctypes.POINTER(ctypes._char)
|
|
||||||
pointer to the memory region of buffer
|
|
||||||
length: int
|
|
||||||
the length of buffer
|
|
||||||
"""
|
|
||||||
data = (ctypes.c_char * length).from_address(ctypes.addressof(ptr.contents))
|
|
||||||
return pickle.loads(data.raw)
|
|
||||||
|
|
||||||
def load_checkpoint(with_local=False):
|
|
||||||
"""Load latest check point.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
with_local: bool, optional
|
|
||||||
whether the checkpoint contains local model
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
tuple : tuple
|
|
||||||
if with_local: return (version, gobal_model, local_model)
|
|
||||||
else return (version, gobal_model)
|
|
||||||
if returned version == 0, this means no model has been CheckPointed
|
|
||||||
and global_model, local_model returned will be None
|
|
||||||
"""
|
|
||||||
gptr = ctypes.POINTER(ctypes.c_char)()
|
|
||||||
global_len = ctypes.c_ulong()
|
|
||||||
if with_local:
|
|
||||||
lptr = ctypes.POINTER(ctypes.c_char)()
|
|
||||||
local_len = ctypes.c_ulong()
|
|
||||||
version = _LIB.RabitLoadCheckPoint(
|
|
||||||
ctypes.byref(gptr),
|
|
||||||
ctypes.byref(global_len),
|
|
||||||
ctypes.byref(lptr),
|
|
||||||
ctypes.byref(local_len))
|
|
||||||
if version == 0:
|
|
||||||
return (version, None, None)
|
|
||||||
return (version,
|
|
||||||
_load_model(gptr, global_len.value),
|
|
||||||
_load_model(lptr, local_len.value))
|
|
||||||
else:
|
|
||||||
version = _LIB.RabitLoadCheckPoint(
|
|
||||||
ctypes.byref(gptr),
|
|
||||||
ctypes.byref(global_len),
|
|
||||||
None, None)
|
|
||||||
if version == 0:
|
|
||||||
return (version, None)
|
|
||||||
return (version,
|
|
||||||
_load_model(gptr, global_len.value))
|
|
||||||
|
|
||||||
def checkpoint(global_model, local_model=None):
|
|
||||||
"""Checkpoint the model.
|
|
||||||
|
|
||||||
This means we finished a stage of execution.
|
|
||||||
Every time we call check point, there is a version number which will increase by one.
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
global_model: anytype that can be pickled
|
|
||||||
globally shared model/state when calling this function,
|
|
||||||
the caller need to gauranttees that global_model is the same in all nodes
|
|
||||||
|
|
||||||
local_model: anytype that can be pickled
|
|
||||||
Local model, that is specific to current node/rank.
|
|
||||||
This can be None when no local state is needed.
|
|
||||||
|
|
||||||
Notes
|
|
||||||
-----
|
|
||||||
local_model requires explicit replication of the model for fault-tolerance.
|
|
||||||
This will bring replication cost in checkpoint function.
|
|
||||||
while global_model do not need explicit replication.
|
|
||||||
It is recommended to use global_model if possible.
|
|
||||||
"""
|
|
||||||
sglobal = pickle.dumps(global_model)
|
|
||||||
if local_model is None:
|
|
||||||
_LIB.RabitCheckPoint(sglobal, len(sglobal), None, 0)
|
|
||||||
del sglobal
|
|
||||||
else:
|
|
||||||
slocal = pickle.dumps(local_model)
|
|
||||||
_LIB.RabitCheckPoint(sglobal, len(sglobal), slocal, len(slocal))
|
|
||||||
del slocal
|
|
||||||
del sglobal
|
|
||||||
|
|
||||||
def version_number():
|
|
||||||
"""Returns version number of current stored model.
|
|
||||||
|
|
||||||
This means how many calls to CheckPoint we made so far.
|
|
||||||
|
|
||||||
Returns
|
|
||||||
-------
|
|
||||||
version : int
|
|
||||||
Version number of currently stored model
|
|
||||||
"""
|
|
||||||
ret = _LIB.RabitVersionNumber()
|
|
||||||
return ret
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
if [ -f mpich/lib/libmpich.so ]; then
|
|
||||||
echo "libmpich.so found -- nothing to build."
|
|
||||||
else
|
|
||||||
echo "Downloading mpich source."
|
|
||||||
wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz
|
|
||||||
tar xfz mpich-3.2.tar.gz
|
|
||||||
rm mpich-3.2.tar.gz*
|
|
||||||
echo "configuring and building mpich."
|
|
||||||
cd mpich-3.2
|
|
||||||
#CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64
|
|
||||||
./configure \
|
|
||||||
--prefix=`pwd`/../mpich \
|
|
||||||
--enable-static=false \
|
|
||||||
--enable-alloca=true \
|
|
||||||
--disable-long-double \
|
|
||||||
--enable-threads=single \
|
|
||||||
--enable-fortran=no \
|
|
||||||
--enable-fast=all \
|
|
||||||
--enable-g=none \
|
|
||||||
--enable-timing=none \
|
|
||||||
--enable-cxx
|
|
||||||
make -j4
|
|
||||||
make install
|
|
||||||
cd -
|
|
||||||
fi
|
|
||||||
@ -1,7 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
set -x
|
|
||||||
|
|
||||||
if [ ${TRAVIS_OS_NAME} != "osx" ]; then
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
conda activate python3
|
|
||||||
conda --version
|
|
||||||
python --version
|
|
||||||
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k_die_same || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k_die_hard || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 local_recover_10_10k || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 lazy_recover_10_10k_die_hard || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 lazy_recover_10_10k_die_same || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 ringallreduce_10_10k || exit -1
|
|
||||||
make -f test.mk RABIT_BUILD_DMLC=1 pylocal_recover_10_10k || exit -1
|
|
||||||
@ -1,36 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# main script of travis
|
|
||||||
if [ ${TASK} == "lint" ]; then
|
|
||||||
make lint RABIT_BUILD_DMLC=1 || exit -1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ${TASK} == "doc" ]; then
|
|
||||||
make doc 2>log.txt
|
|
||||||
(cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag" |grep warning) && exit -1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# we should depreciate Makefile based build
|
|
||||||
if [ ${TASK} == "build" ]; then
|
|
||||||
make all RABIT_BUILD_DMLC=1 || exit -1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ${TASK} == "mpi-build" ]; then
|
|
||||||
./scripts/mpi_build.sh
|
|
||||||
cd test
|
|
||||||
make mpi RABIT_BUILD_DMLC=1 && make speed_test.mpi RABIT_BUILD_DMLC=1 || exit -1
|
|
||||||
fi
|
|
||||||
#
|
|
||||||
if [ ${TASK} == "cmake-test" ]; then
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
cmake -DRABIT_BUILD_TESTS=ON -DRABIT_BUILD_DMLC=ON -DGTEST_ROOT=${HOME}/.local ..
|
|
||||||
# known osx gtest 1.8 issue
|
|
||||||
cp ${HOME}/.local/lib/*.dylib .
|
|
||||||
make -j$(nproc)
|
|
||||||
make test
|
|
||||||
make install || exit -1
|
|
||||||
cd ../test
|
|
||||||
../scripts/travis_runtest.sh || exit -1
|
|
||||||
rm -rf ../build
|
|
||||||
fi
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
echo "Testing on: ${TRAVIS_OS_NAME}, Home directory: ${HOME}"
|
|
||||||
|
|
||||||
# Install Miniconda
|
|
||||||
if [ ${TRAVIS_OS_NAME} == "osx" ]; then
|
|
||||||
wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
|
|
||||||
else
|
|
||||||
wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
|
|
||||||
fi
|
|
||||||
bash conda.sh -b -p $HOME/miniconda
|
|
||||||
source $HOME/miniconda/bin/activate
|
|
||||||
conda config --set always_yes yes --set changeps1 no
|
|
||||||
conda update -q conda
|
|
||||||
conda info -a
|
|
||||||
conda create -n python3 python=3.7
|
|
||||||
conda activate python3
|
|
||||||
conda --version
|
|
||||||
python --version
|
|
||||||
# Install Python packages
|
|
||||||
conda install -c conda-forge numpy scipy urllib3 websocket-client
|
|
||||||
python -m pip install cpplint pylint kubernetes
|
|
||||||
|
|
||||||
# Install googletest under home directory
|
|
||||||
GTEST_VERSION=1.8.1
|
|
||||||
GTEST_RELEASE=release-${GTEST_VERSION}.tar.gz
|
|
||||||
GTEST_TAR_BALL=googletest_${GTEST_RELEASE}
|
|
||||||
|
|
||||||
wget https://github.com/google/googletest/archive/${GTEST_RELEASE} -O ${GTEST_TAR_BALL}
|
|
||||||
echo "152b849610d91a9dfa1401293f43230c2e0c33f8 ${GTEST_TAR_BALL}" | sha1sum -c
|
|
||||||
tar -xf ${GTEST_TAR_BALL}
|
|
||||||
pushd .
|
|
||||||
|
|
||||||
cd googletest-release-${GTEST_VERSION}
|
|
||||||
mkdir build
|
|
||||||
cd build
|
|
||||||
echo "Installing to ${HOME}/.local"
|
|
||||||
cmake .. -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${HOME}/.local
|
|
||||||
make -j$(nproc)
|
|
||||||
make install
|
|
||||||
|
|
||||||
popd
|
|
||||||
|
|
||||||
if [ ${TRAVIS_OS_NAME} == "linux" ]; then
|
|
||||||
sudo apt-get install tree
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ${TRAVIS_OS_NAME} == "osx" ]; then
|
|
||||||
brew install python3
|
|
||||||
fi
|
|
||||||
@ -1,40 +0,0 @@
|
|||||||
# script to be sourced in travis yml
|
|
||||||
# setup all enviroment variables
|
|
||||||
|
|
||||||
export CACHE_PREFIX=${HOME}/.cache/usr
|
|
||||||
export PATH=${HOME}/.local/bin:${PATH}
|
|
||||||
export PATH=${PATH}:${CACHE_PREFIX}/bin
|
|
||||||
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${CACHE_PREFIX}/include
|
|
||||||
export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${CACHE_PREFIX}/include
|
|
||||||
export LIBRARY_PATH=${LIBRARY_PATH}:${CACHE_PREFIX}/lib
|
|
||||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CACHE_PREFIX}/lib
|
|
||||||
export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${CACHE_PREFIX}/lib
|
|
||||||
|
|
||||||
alias make="make -j4"
|
|
||||||
|
|
||||||
# setup the cache prefix folder
|
|
||||||
if [ ! -d ${HOME}/.cache ]; then
|
|
||||||
mkdir ${HOME}/.cache
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -d ${CACHE_PREFIX} ]; then
|
|
||||||
mkdir ${CACHE_PREFIX}
|
|
||||||
fi
|
|
||||||
if [ ! -d ${CACHE_PREFIX}/include ]; then
|
|
||||||
mkdir ${CACHE_PREFIX}/include
|
|
||||||
fi
|
|
||||||
if [ ! -d ${CACHE_PREFIX}/lib ]; then
|
|
||||||
mkdir ${CACHE_PREFIX}/lib
|
|
||||||
fi
|
|
||||||
if [ ! -d ${CACHE_PREFIX}/bin ]; then
|
|
||||||
mkdir ${CACHE_PREFIX}/bin
|
|
||||||
fi
|
|
||||||
|
|
||||||
# setup CUDA path if NVCC_PREFIX exists
|
|
||||||
if [ ! -z "$NVCC_PREFIX" ]; then
|
|
||||||
export PATH=${PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/bin
|
|
||||||
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include
|
|
||||||
export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include
|
|
||||||
export LIBRARY_PATH=${LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu
|
|
||||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu
|
|
||||||
fi
|
|
||||||
@ -275,7 +275,7 @@ bool AllreduceBase::ReConnectLinks(const char *cmd) {
|
|||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
utils::TCPSocket tracker = this->ConnectTracker();
|
utils::TCPSocket tracker = this->ConnectTracker();
|
||||||
fprintf(stdout, "task %s connected to the tracker\n", task_id.c_str());
|
LOG(INFO) << "task " << task_id << " connected to the tracker";
|
||||||
tracker.SendStr(std::string(cmd));
|
tracker.SendStr(std::string(cmd));
|
||||||
|
|
||||||
// the rank of previous link, next link in ring
|
// the rank of previous link, next link in ring
|
||||||
|
|||||||
@ -98,14 +98,9 @@ class AllreduceBase : public IEngine {
|
|||||||
* \param slice_begin beginning of the current slice
|
* \param slice_begin beginning of the current slice
|
||||||
* \param slice_end end of the current slice
|
* \param slice_end end of the current slice
|
||||||
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
* \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
void Allgather(void *sendrecvbuf_, size_t total_size, size_t slice_begin,
|
void Allgather(void *sendrecvbuf_, size_t total_size, size_t slice_begin,
|
||||||
size_t slice_end, size_t size_prev_slice,
|
size_t slice_end, size_t size_prev_slice) override {
|
||||||
const char *_file = _FILE, const int _line = _LINE,
|
|
||||||
const char *_caller = _CALLER) override {
|
|
||||||
if (world_size == 1 || world_size == -1) {
|
if (world_size == 1 || world_size == -1) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -124,15 +119,10 @@ class AllreduceBase : public IEngine {
|
|||||||
* will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
|
* will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
|
||||||
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
|
||||||
* \param prepare_arg argument used to passed into the lazy preprocessing function
|
* \param prepare_arg argument used to passed into the lazy preprocessing function
|
||||||
* \param _file caller file name used to generate unique cache key
|
|
||||||
* \param _line caller line number used to generate unique cache key
|
|
||||||
* \param _caller caller function name used to generate unique cache key
|
|
||||||
*/
|
*/
|
||||||
void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
|
void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
|
||||||
ReduceFunction reducer, PreprocFunction prepare_fun = nullptr,
|
ReduceFunction reducer, PreprocFunction prepare_fun = nullptr,
|
||||||
void *prepare_arg = nullptr, const char *_file = _FILE,
|
void *prepare_arg = nullptr) override {
|
||||||
const int _line = _LINE,
|
|
||||||
const char *_caller = _CALLER) override {
|
|
||||||
if (prepare_fun != nullptr) prepare_fun(prepare_arg);
|
if (prepare_fun != nullptr) prepare_fun(prepare_arg);
|
||||||
if (world_size == 1 || world_size == -1) return;
|
if (world_size == 1 || world_size == -1) return;
|
||||||
utils::Assert(TryAllreduce(sendrecvbuf_, type_nbytes, count, reducer) ==
|
utils::Assert(TryAllreduce(sendrecvbuf_, type_nbytes, count, reducer) ==
|
||||||
@ -148,9 +138,7 @@ class AllreduceBase : public IEngine {
|
|||||||
* \param _line caller line number used to generate unique cache key
|
* \param _line caller line number used to generate unique cache key
|
||||||
* \param _caller caller function name used to generate unique cache key
|
* \param _caller caller function name used to generate unique cache key
|
||||||
*/
|
*/
|
||||||
void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
|
void Broadcast(void *sendrecvbuf_, size_t total_size, int root) override {
|
||||||
const char *_file = _FILE, const int _line = _LINE,
|
|
||||||
const char *_caller = _CALLER) override {
|
|
||||||
if (world_size == 1 || world_size == -1) return;
|
if (world_size == 1 || world_size == -1) return;
|
||||||
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
|
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
|
||||||
"Broadcast failed");
|
"Broadcast failed");
|
||||||
@ -232,14 +220,6 @@ class AllreduceBase : public IEngine {
|
|||||||
int VersionNumber() const override {
|
int VersionNumber() const override {
|
||||||
return version_number;
|
return version_number;
|
||||||
}
|
}
|
||||||
/*!
|
|
||||||
* \brief explicitly re-init everything before calling LoadCheckPoint
|
|
||||||
* call this function when IEngine throw an exception out,
|
|
||||||
* this function is only used for test purpose
|
|
||||||
*/
|
|
||||||
void InitAfterException() override {
|
|
||||||
utils::Error("InitAfterException: not implemented");
|
|
||||||
}
|
|
||||||
/*!
|
/*!
|
||||||
* \brief report current status to the job tracker
|
* \brief report current status to the job tracker
|
||||||
* depending on the job tracker we are in
|
* depending on the job tracker we are in
|
||||||
|
|||||||
@ -11,8 +11,8 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <dmlc/timer.h>
|
||||||
#include "rabit/internal/engine.h"
|
#include "rabit/internal/engine.h"
|
||||||
#include "rabit/internal/timer.h"
|
|
||||||
#include "allreduce_base.h"
|
#include "allreduce_base.h"
|
||||||
|
|
||||||
namespace rabit {
|
namespace rabit {
|
||||||
@ -46,36 +46,30 @@ class AllreduceMock : public AllreduceBase {
|
|||||||
}
|
}
|
||||||
void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
|
void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
|
||||||
ReduceFunction reducer, PreprocFunction prepare_fun,
|
ReduceFunction reducer, PreprocFunction prepare_fun,
|
||||||
void *prepare_arg, const char *_file = _FILE,
|
void *prepare_arg) override {
|
||||||
const int _line = _LINE,
|
|
||||||
const char *_caller = _CALLER) override {
|
|
||||||
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "AllReduce");
|
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "AllReduce");
|
||||||
double tstart = utils::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
AllreduceBase::Allreduce(sendrecvbuf_, type_nbytes, count, reducer,
|
AllreduceBase::Allreduce(sendrecvbuf_, type_nbytes, count, reducer,
|
||||||
prepare_fun, prepare_arg, _file, _line, _caller);
|
prepare_fun, prepare_arg);
|
||||||
tsum_allreduce_ += utils::GetTime() - tstart;
|
tsum_allreduce_ += dmlc::GetTime() - tstart;
|
||||||
}
|
}
|
||||||
void Allgather(void *sendrecvbuf, size_t total_size, size_t slice_begin,
|
void Allgather(void *sendrecvbuf, size_t total_size, size_t slice_begin,
|
||||||
size_t slice_end, size_t size_prev_slice,
|
size_t slice_end, size_t size_prev_slice) override {
|
||||||
const char *_file = _FILE, const int _line = _LINE,
|
|
||||||
const char *_caller = _CALLER) override {
|
|
||||||
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "Allgather");
|
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "Allgather");
|
||||||
double tstart = utils::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
AllreduceBase::Allgather(sendrecvbuf, total_size, slice_begin, slice_end,
|
AllreduceBase::Allgather(sendrecvbuf, total_size, slice_begin, slice_end,
|
||||||
size_prev_slice, _file, _line, _caller);
|
size_prev_slice);
|
||||||
tsum_allgather_ += utils::GetTime() - tstart;
|
tsum_allgather_ += dmlc::GetTime() - tstart;
|
||||||
}
|
}
|
||||||
void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
|
void Broadcast(void *sendrecvbuf_, size_t total_size, int root) override {
|
||||||
const char *_file = _FILE, const int _line = _LINE,
|
|
||||||
const char *_caller = _CALLER) override {
|
|
||||||
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "Broadcast");
|
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "Broadcast");
|
||||||
AllreduceBase::Broadcast(sendrecvbuf_, total_size, root, _file, _line, _caller);
|
AllreduceBase::Broadcast(sendrecvbuf_, total_size, root);
|
||||||
}
|
}
|
||||||
int LoadCheckPoint(Serializable *global_model,
|
int LoadCheckPoint(Serializable *global_model,
|
||||||
Serializable *local_model) override {
|
Serializable *local_model) override {
|
||||||
tsum_allreduce_ = 0.0;
|
tsum_allreduce_ = 0.0;
|
||||||
tsum_allgather_ = 0.0;
|
tsum_allgather_ = 0.0;
|
||||||
time_checkpoint_ = utils::GetTime();
|
time_checkpoint_ = dmlc::GetTime();
|
||||||
if (force_local_ == 0) {
|
if (force_local_ == 0) {
|
||||||
return AllreduceBase::LoadCheckPoint(global_model, local_model);
|
return AllreduceBase::LoadCheckPoint(global_model, local_model);
|
||||||
} else {
|
} else {
|
||||||
@ -87,7 +81,7 @@ class AllreduceMock : public AllreduceBase {
|
|||||||
void CheckPoint(const Serializable *global_model,
|
void CheckPoint(const Serializable *global_model,
|
||||||
const Serializable *local_model) override {
|
const Serializable *local_model) override {
|
||||||
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "CheckPoint");
|
this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "CheckPoint");
|
||||||
double tstart = utils::GetTime();
|
double tstart = dmlc::GetTime();
|
||||||
double tbet_chkpt = tstart - time_checkpoint_;
|
double tbet_chkpt = tstart - time_checkpoint_;
|
||||||
if (force_local_ == 0) {
|
if (force_local_ == 0) {
|
||||||
AllreduceBase::CheckPoint(global_model, local_model);
|
AllreduceBase::CheckPoint(global_model, local_model);
|
||||||
@ -96,8 +90,8 @@ class AllreduceMock : public AllreduceBase {
|
|||||||
ComboSerializer com(global_model, local_model);
|
ComboSerializer com(global_model, local_model);
|
||||||
AllreduceBase::CheckPoint(&dum, &com);
|
AllreduceBase::CheckPoint(&dum, &com);
|
||||||
}
|
}
|
||||||
time_checkpoint_ = utils::GetTime();
|
time_checkpoint_ = dmlc::GetTime();
|
||||||
double tcost = utils::GetTime() - tstart;
|
double tcost = dmlc::GetTime() - tstart;
|
||||||
if (report_stats_ != 0 && rank == 0) {
|
if (report_stats_ != 0 && rank == 0) {
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "[v" << version_number << "] global_size="
|
ss << "[v" << version_number << "] global_size="
|
||||||
|
|||||||
@ -228,12 +228,12 @@ RABIT_DLL bool RabitInit(int argc, char *argv[]) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
RABIT_DLL bool RabitFinalize() {
|
RABIT_DLL int RabitFinalize() {
|
||||||
auto ret = rabit::Finalize();
|
auto ret = rabit::Finalize();
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
XGBAPISetLastError("Failed to shutdown RABIT worker.");
|
XGBAPISetLastError("Failed to shutdown RABIT worker.");
|
||||||
}
|
}
|
||||||
return ret;
|
return static_cast<int>(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
RABIT_DLL int RabitGetRingPrevRank() {
|
RABIT_DLL int RabitGetRingPrevRank() {
|
||||||
@ -270,37 +270,39 @@ RABIT_DLL void RabitGetProcessorName(char *out_name,
|
|||||||
*out_len = static_cast<rbt_ulong>(s.length());
|
*out_len = static_cast<rbt_ulong>(s.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
RABIT_DLL void RabitBroadcast(void *sendrecv_data,
|
RABIT_DLL int RabitBroadcast(void *sendrecv_data,
|
||||||
rbt_ulong size, int root) {
|
rbt_ulong size, int root) {
|
||||||
|
API_BEGIN()
|
||||||
rabit::Broadcast(sendrecv_data, size, root);
|
rabit::Broadcast(sendrecv_data, size, root);
|
||||||
|
API_END()
|
||||||
}
|
}
|
||||||
|
|
||||||
RABIT_DLL void RabitAllgather(void *sendrecvbuf_, size_t total_size,
|
RABIT_DLL int RabitAllgather(void *sendrecvbuf_, size_t total_size,
|
||||||
size_t beginIndex, size_t size_node_slice,
|
size_t beginIndex, size_t size_node_slice,
|
||||||
size_t size_prev_slice, int enum_dtype) {
|
size_t size_prev_slice, int enum_dtype) {
|
||||||
rabit::c_api::Allgather(sendrecvbuf_,
|
API_BEGIN()
|
||||||
total_size,
|
rabit::c_api::Allgather(
|
||||||
beginIndex,
|
sendrecvbuf_, total_size, beginIndex, size_node_slice, size_prev_slice,
|
||||||
size_node_slice,
|
static_cast<rabit::engine::mpi::DataType>(enum_dtype));
|
||||||
size_prev_slice,
|
API_END()
|
||||||
static_cast<rabit::engine::mpi::DataType>(enum_dtype));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
RABIT_DLL void RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
|
RABIT_DLL int RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
|
||||||
int enum_op, void (*prepare_fun)(void *arg),
|
int enum_op, void (*prepare_fun)(void *arg),
|
||||||
void *prepare_arg) {
|
void *prepare_arg) {
|
||||||
rabit::c_api::Allreduce
|
API_BEGIN()
|
||||||
(sendrecvbuf, count,
|
rabit::c_api::Allreduce(sendrecvbuf, count,
|
||||||
static_cast<rabit::engine::mpi::DataType>(enum_dtype),
|
static_cast<rabit::engine::mpi::DataType>(enum_dtype),
|
||||||
static_cast<rabit::engine::mpi::OpType>(enum_op),
|
static_cast<rabit::engine::mpi::OpType>(enum_op),
|
||||||
prepare_fun, prepare_arg);
|
prepare_fun, prepare_arg);
|
||||||
|
API_END()
|
||||||
}
|
}
|
||||||
|
|
||||||
RABIT_DLL int RabitLoadCheckPoint(char **out_global_model,
|
RABIT_DLL int RabitLoadCheckPoint(char **out_global_model,
|
||||||
rbt_ulong *out_global_len,
|
rbt_ulong *out_global_len,
|
||||||
char **out_local_model,
|
char **out_local_model,
|
||||||
rbt_ulong *out_local_len) {
|
rbt_ulong *out_local_len) {
|
||||||
// NOTE: this function is not thread-safe
|
// no-op as XGBoost 1.3
|
||||||
using rabit::BeginPtr;
|
using rabit::BeginPtr;
|
||||||
using namespace rabit::c_api; // NOLINT(*)
|
using namespace rabit::c_api; // NOLINT(*)
|
||||||
static std::string global_buffer;
|
static std::string global_buffer;
|
||||||
|
|||||||
@ -85,12 +85,9 @@ IEngine *GetEngine() {
|
|||||||
void Allgather(void *sendrecvbuf_, size_t total_size,
|
void Allgather(void *sendrecvbuf_, size_t total_size,
|
||||||
size_t slice_begin,
|
size_t slice_begin,
|
||||||
size_t slice_end,
|
size_t slice_end,
|
||||||
size_t size_prev_slice,
|
size_t size_prev_slice) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
GetEngine()->Allgather(sendrecvbuf_, total_size, slice_begin,
|
GetEngine()->Allgather(sendrecvbuf_, total_size, slice_begin,
|
||||||
slice_end, size_prev_slice, _file, _line, _caller);
|
slice_end, size_prev_slice);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -102,12 +99,9 @@ void Allreduce_(void *sendrecvbuf, // NOLINT
|
|||||||
mpi::DataType dtype,
|
mpi::DataType dtype,
|
||||||
mpi::OpType op,
|
mpi::OpType op,
|
||||||
IEngine::PreprocFunction prepare_fun,
|
IEngine::PreprocFunction prepare_fun,
|
||||||
void *prepare_arg,
|
void *prepare_arg) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun,
|
GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun,
|
||||||
prepare_arg, _file, _line, _caller);
|
prepare_arg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// code for reduce handle
|
// code for reduce handle
|
||||||
@ -126,14 +120,10 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
|
|||||||
void ReduceHandle::Allreduce(void *sendrecvbuf,
|
void ReduceHandle::Allreduce(void *sendrecvbuf,
|
||||||
size_t type_nbytes, size_t count,
|
size_t type_nbytes, size_t count,
|
||||||
IEngine::PreprocFunction prepare_fun,
|
IEngine::PreprocFunction prepare_fun,
|
||||||
void *prepare_arg,
|
void *prepare_arg) {
|
||||||
const char* _file,
|
|
||||||
const int _line,
|
|
||||||
const char* _caller) {
|
|
||||||
utils::Assert(redfunc_ != nullptr, "must intialize handle to call AllReduce");
|
utils::Assert(redfunc_ != nullptr, "must intialize handle to call AllReduce");
|
||||||
GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
|
GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
|
||||||
redfunc_, prepare_fun, prepare_arg,
|
redfunc_, prepare_fun, prepare_arg);
|
||||||
_file, _line, _caller);
|
|
||||||
}
|
}
|
||||||
} // namespace engine
|
} // namespace engine
|
||||||
} // namespace rabit
|
} // namespace rabit
|
||||||
|
|||||||
3
rabit/test/.gitignore
vendored
3
rabit/test/.gitignore
vendored
@ -1,3 +0,0 @@
|
|||||||
*.mpi
|
|
||||||
*_test
|
|
||||||
*_recover
|
|
||||||
@ -1,77 +0,0 @@
|
|||||||
RABIT_BUILD_DMLC = 0
|
|
||||||
|
|
||||||
ifeq ($(RABIT_BUILD_DMLC),1)
|
|
||||||
DMLC=../dmlc-core
|
|
||||||
else
|
|
||||||
DMLC=../../dmlc-core
|
|
||||||
endif
|
|
||||||
|
|
||||||
MPICXX=../mpich/bin/mpicxx
|
|
||||||
export LDFLAGS= -L../lib -pthread -lm
|
|
||||||
export CFLAGS = -Wall -O3 -Wno-unknown-pragmas
|
|
||||||
|
|
||||||
export CC = gcc
|
|
||||||
export CXX = g++
|
|
||||||
|
|
||||||
|
|
||||||
#----------------------------
|
|
||||||
# Settings for power and arm arch
|
|
||||||
#----------------------------
|
|
||||||
ARCH := $(shell uname -a)
|
|
||||||
ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
|
|
||||||
CFLAGS += -march=native
|
|
||||||
else
|
|
||||||
CFLAGS += -msse2
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifndef WITH_FPIC
|
|
||||||
WITH_FPIC = 1
|
|
||||||
endif
|
|
||||||
ifeq ($(WITH_FPIC), 1)
|
|
||||||
CFLAGS += -fPIC
|
|
||||||
endif
|
|
||||||
|
|
||||||
CFLAGS += -I../include -I $(DMLC)/include -std=c++11
|
|
||||||
|
|
||||||
# specify tensor path
|
|
||||||
BIN = speed_test model_recover local_recover lazy_recover
|
|
||||||
OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
|
|
||||||
MPIBIN = speed_test.mpi
|
|
||||||
.PHONY: clean all lib mpi
|
|
||||||
|
|
||||||
.PHONY: lib all
|
|
||||||
|
|
||||||
all: $(BIN)
|
|
||||||
|
|
||||||
lib:
|
|
||||||
cd ..;make clean;make;cd -
|
|
||||||
|
|
||||||
.PHONY: mpi
|
|
||||||
mpi:
|
|
||||||
cd ..;make mpi;cd -
|
|
||||||
|
|
||||||
# programs
|
|
||||||
speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
|
|
||||||
model_recover.o: model_recover.cc ../include/rabit/*.h lib
|
|
||||||
local_recover.o: local_recover.cc ../include/rabit/*.h lib
|
|
||||||
lazy_recover.o: lazy_recover.cc ../include/rabit/*.h lib
|
|
||||||
|
|
||||||
# we can link against MPI version to get use MPI
|
|
||||||
speed_test: speed_test.o $(RABIT_OBJ)
|
|
||||||
speed_test.mpi: speed_test.o $(MPIOBJ)
|
|
||||||
model_recover: model_recover.o $(RABIT_OBJ)
|
|
||||||
local_recover: local_recover.o $(RABIT_OBJ)
|
|
||||||
lazy_recover: lazy_recover.o $(RABIT_OBJ)
|
|
||||||
|
|
||||||
$(BIN) :
|
|
||||||
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mock.a $(LDFLAGS)
|
|
||||||
|
|
||||||
$(OBJ) :
|
|
||||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
|
|
||||||
|
|
||||||
$(MPIBIN) :
|
|
||||||
$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
|
|
||||||
../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
|
|
||||||
|
|
||||||
clean:
|
|
||||||
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
Testcases of Rabit
|
|
||||||
====
|
|
||||||
This folder contains internal testcases to test correctness and efficiency of rabit API
|
|
||||||
|
|
||||||
The example running scripts for testcases are given by test.mk
|
|
||||||
* type ```make -f test.mk testcasename``` to run certain testcase
|
|
||||||
|
|
||||||
|
|
||||||
Helper Scripts
|
|
||||||
====
|
|
||||||
* test.mk contains Makefile documentation of all testcases
|
|
||||||
* keepalive.sh helper bash to restart a program when it dies abnormally
|
|
||||||
|
|
||||||
List of Programs
|
|
||||||
====
|
|
||||||
* speed_test: test the running speed of rabit API
|
|
||||||
* test_local_recover: test recovery of local state when error happens
|
|
||||||
* test_model_recover: test recovery of global state when error happens
|
|
||||||
@ -1,125 +0,0 @@
|
|||||||
// this is a test case to test whether rabit can recover model when
|
|
||||||
// facing an exception
|
|
||||||
#include <rabit/rabit.h>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cmath>
|
|
||||||
using namespace rabit;
|
|
||||||
|
|
||||||
// dummy model
|
|
||||||
class Model : public rabit::Serializable {
|
|
||||||
public:
|
|
||||||
// iterations
|
|
||||||
std::vector<float> data;
|
|
||||||
// load from stream
|
|
||||||
virtual void Load(rabit::Stream *fi) {
|
|
||||||
fi->Read(&data);
|
|
||||||
}
|
|
||||||
/*! \brief save the model to the stream */
|
|
||||||
virtual void Save(rabit::Stream *fo) const {
|
|
||||||
fo->Write(data);
|
|
||||||
}
|
|
||||||
virtual void InitModel(size_t n) {
|
|
||||||
data.clear();
|
|
||||||
data.resize(n, 1.0f);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
inline void TestMax(Model *model, int ntrial, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = iter + 111;
|
|
||||||
|
|
||||||
std::vector<float> ndata(model->data.size());
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z + model->data[i];
|
|
||||||
}
|
|
||||||
rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
float rmax = (i * 1) % z + model->data[i];
|
|
||||||
for (int r = 0; r < nproc; ++r) {
|
|
||||||
rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
|
|
||||||
}
|
|
||||||
utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestSum(Model *model, int ntrial, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = 131 + iter;
|
|
||||||
|
|
||||||
std::vector<float> ndata(model->data.size());
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z + model->data[i];
|
|
||||||
}
|
|
||||||
Allreduce<op::Sum>(&ndata[0], ndata.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
float rsum = model->data[i] * nproc;
|
|
||||||
for (int r = 0; r < nproc; ++r) {
|
|
||||||
rsum += (float)((i * (r+1)) % z);
|
|
||||||
}
|
|
||||||
utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
|
|
||||||
"[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
|
|
||||||
}
|
|
||||||
model->data = ndata;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestBcast(size_t n, int root, int ntrial, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
std::string s; s.resize(n);
|
|
||||||
for (size_t i = 0; i < n; ++i) {
|
|
||||||
s[i] = char(i % 126 + 1);
|
|
||||||
}
|
|
||||||
std::string res;
|
|
||||||
if (root == rank) {
|
|
||||||
res = s;
|
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
} else {
|
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
}
|
|
||||||
utils::Check(res == s, "[%d] TestBcast fail", rank);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
if (argc < 3) {
|
|
||||||
printf("Usage: <ndata> <config>\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
int n = atoi(argv[1]);
|
|
||||||
rabit::Init(argc, argv);
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
std::string name = rabit::GetProcessorName();
|
|
||||||
Model model;
|
|
||||||
srand(0);
|
|
||||||
int ntrial = 0;
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
|
||||||
int n;
|
|
||||||
if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n;
|
|
||||||
}
|
|
||||||
int iter = rabit::LoadCheckPoint(&model);
|
|
||||||
if (iter == 0) {
|
|
||||||
model.InitModel(n);
|
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
} else {
|
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
}
|
|
||||||
for (int r = iter; r < 3; ++r) {
|
|
||||||
TestMax(&model, ntrial, r);
|
|
||||||
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
|
|
||||||
int step = std::max(nproc / 3, 1);
|
|
||||||
for (int i = 0; i < nproc; i += step) {
|
|
||||||
TestBcast(n, i, ntrial, r);
|
|
||||||
}
|
|
||||||
printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
|
|
||||||
TestSum(&model, ntrial, r);
|
|
||||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
|
||||||
rabit::LazyCheckPoint(&model);
|
|
||||||
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
|
||||||
}
|
|
||||||
rabit::Finalize();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@ -1,137 +0,0 @@
|
|||||||
// this is a test case to test whether rabit can recover model when
|
|
||||||
// facing an exception
|
|
||||||
#include <rabit/rabit.h>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
using namespace rabit;
|
|
||||||
|
|
||||||
// dummy model
|
|
||||||
class Model : public rabit::Serializable {
|
|
||||||
public:
|
|
||||||
// iterations
|
|
||||||
std::vector<float> data;
|
|
||||||
// load from stream
|
|
||||||
virtual void Load(rabit::Stream *fi) {
|
|
||||||
fi->Read(&data);
|
|
||||||
}
|
|
||||||
/*! \brief save the model to the stream */
|
|
||||||
virtual void Save(rabit::Stream *fo) const {
|
|
||||||
fo->Write(data);
|
|
||||||
}
|
|
||||||
virtual void InitModel(size_t n, float v) {
|
|
||||||
data.clear();
|
|
||||||
data.resize(n, v);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
inline void TestMax(Model *model, Model *local, int ntrial, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = iter + 111;
|
|
||||||
std::vector<float> ndata(model->data.size());
|
|
||||||
rabit::Allreduce<op::Max>(&ndata[0], ndata.size(),
|
|
||||||
[&]() {
|
|
||||||
// use lambda expression to prepare the data
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z + local->data[i];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
float rmax = (i * 1) % z + model->data[i];
|
|
||||||
for (int r = 0; r < nproc; ++r) {
|
|
||||||
rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i] + r);
|
|
||||||
}
|
|
||||||
utils::Check(rmax == ndata[i], "[%d] TestMax check failure", rank);
|
|
||||||
}
|
|
||||||
model->data = ndata;
|
|
||||||
local->data = ndata;
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
local->data[i] = ndata[i] + rank;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestSum(Model *model, Model *local, int ntrial, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = 131 + iter;
|
|
||||||
|
|
||||||
std::vector<float> ndata(model->data.size());
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z + local->data[i];
|
|
||||||
}
|
|
||||||
Allreduce<op::Sum>(&ndata[0], ndata.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
float rsum = 0.0f;
|
|
||||||
for (int r = 0; r < nproc; ++r) {
|
|
||||||
rsum += (float)((i * (r+1)) % z) + model->data[i] + r;
|
|
||||||
}
|
|
||||||
utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
|
|
||||||
"[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
|
|
||||||
}
|
|
||||||
model->data = ndata;
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
local->data[i] = ndata[i] + rank;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestBcast(size_t n, int root, int ntrial, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
std::string s; s.resize(n);
|
|
||||||
for (size_t i = 0; i < n; ++i) {
|
|
||||||
s[i] = char(i % 126 + 1);
|
|
||||||
}
|
|
||||||
std::string res;
|
|
||||||
if (root == rank) {
|
|
||||||
res = s;
|
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
} else {
|
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
}
|
|
||||||
utils::Check(res == s, "[%d] TestBcast fail", rank);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
if (argc < 3) {
|
|
||||||
printf("Usage: <ndata>\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
int n = atoi(argv[1]);
|
|
||||||
rabit::Init(argc, argv);
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
std::string name = rabit::GetProcessorName();
|
|
||||||
Model model, local;
|
|
||||||
srand(0);
|
|
||||||
int ntrial = 0;
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
|
||||||
int n;
|
|
||||||
if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n;
|
|
||||||
}
|
|
||||||
int iter = rabit::LoadCheckPoint(&model, &local);
|
|
||||||
if (iter == 0) {
|
|
||||||
model.InitModel(n, 1.0f);
|
|
||||||
local.InitModel(n, 1.0f + rank);
|
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
} else {
|
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
}
|
|
||||||
for (int r = iter; r < 3; ++r) {
|
|
||||||
TestMax(&model, &local, ntrial, r);
|
|
||||||
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
|
|
||||||
int step = std::max(nproc / 3, 1);
|
|
||||||
for (int i = 0; i < nproc; i += step) {
|
|
||||||
TestBcast(n, i, ntrial, r);
|
|
||||||
}
|
|
||||||
printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
|
|
||||||
TestSum(&model, &local, ntrial, r);
|
|
||||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
|
||||||
rabit::CheckPoint(&model, &local);
|
|
||||||
printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
|
|
||||||
}
|
|
||||||
rabit::Finalize();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@ -1,32 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
from __future__ import print_function
|
|
||||||
from builtins import range
|
|
||||||
|
|
||||||
import sys
|
|
||||||
sys.path.append('../python')
|
|
||||||
|
|
||||||
import rabit
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
rabit.init(lib='mock')
|
|
||||||
rank = rabit.get_rank()
|
|
||||||
n = 10
|
|
||||||
nround = 3
|
|
||||||
data = np.ones(n) * rank
|
|
||||||
|
|
||||||
version, model, local = rabit.load_checkpoint(True)
|
|
||||||
if version == 0:
|
|
||||||
model = np.zeros(n)
|
|
||||||
local = np.ones(n)
|
|
||||||
else:
|
|
||||||
print('[%d] restart from version %d' % (rank, version))
|
|
||||||
|
|
||||||
for i in range(version, nround):
|
|
||||||
res = rabit.allreduce(data + model+local, rabit.SUM)
|
|
||||||
print('[%d] iter=%d: %s' % (rank, i, str(res)))
|
|
||||||
model = res
|
|
||||||
local[:] = i
|
|
||||||
rabit.checkpoint(model, local)
|
|
||||||
|
|
||||||
rabit.finalize()
|
|
||||||
@ -1,157 +0,0 @@
|
|||||||
// this is a test case to test whether rabit can recover model when
|
|
||||||
// facing an exception
|
|
||||||
#include <rabit/rabit.h>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
using namespace rabit;
|
|
||||||
|
|
||||||
// dummy model
|
|
||||||
class Model : public rabit::Serializable {
|
|
||||||
public:
|
|
||||||
// iterations
|
|
||||||
std::vector<float> data;
|
|
||||||
// load from stream
|
|
||||||
virtual void Load(rabit::Stream *fi) {
|
|
||||||
fi->Read(&data);
|
|
||||||
}
|
|
||||||
/*! \brief save the model to the stream */
|
|
||||||
virtual void Save(rabit::Stream *fo) const {
|
|
||||||
fo->Write(data);
|
|
||||||
}
|
|
||||||
virtual void InitModel(size_t n) {
|
|
||||||
data.clear();
|
|
||||||
data.resize(n, 1.0f);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
inline void TestMax(Model *model, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = iter + 111;
|
|
||||||
|
|
||||||
std::vector<float> ndata(model->data.size());
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z + model->data[i];
|
|
||||||
}
|
|
||||||
rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
float rmax = (i * 1) % z + model->data[i];
|
|
||||||
for (int r = 0; r < nproc; ++r) {
|
|
||||||
rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
|
|
||||||
}
|
|
||||||
utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]);
|
|
||||||
}
|
|
||||||
model->data = ndata;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestSum(Model *model, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = 131 + iter;
|
|
||||||
|
|
||||||
std::vector<float> ndata(model->data.size());
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z + model->data[i];
|
|
||||||
}
|
|
||||||
Allreduce<op::Sum>(&ndata[0], ndata.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
float rsum = model->data[i] * nproc;
|
|
||||||
for (int r = 0; r < nproc; ++r) {
|
|
||||||
rsum += (float)((i * (r+1)) % z);
|
|
||||||
}
|
|
||||||
utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
|
|
||||||
"[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
|
|
||||||
}
|
|
||||||
model->data = ndata;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestAllgather(Model *model, int iter) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
const int z = 131 + iter;
|
|
||||||
|
|
||||||
std::vector<float> ndata(model->data.size() * nproc);
|
|
||||||
size_t beginSlice = rank * model->data.size();
|
|
||||||
for (size_t i = 0; i < model->data.size(); ++i) {
|
|
||||||
ndata[beginSlice + i] = (i * (rank+1)) % z + model->data[i];
|
|
||||||
}
|
|
||||||
Allgather(&ndata[0], ndata.size(), beginSlice,
|
|
||||||
model->data.size(), model->data.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
int curRank = i / model->data.size();
|
|
||||||
int remainder = i % model->data.size();
|
|
||||||
float data = (remainder * (curRank+1)) % z + model->data[remainder];
|
|
||||||
utils::Check(fabsf(data - ndata[i]) < 1e-5 ,
|
|
||||||
"[%d] TestAllgather check failure, local=%g, allgatherring=%g", rank, data, ndata[i]);
|
|
||||||
}
|
|
||||||
model->data = ndata;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestBcast(size_t n, int root) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
std::string s; s.resize(n);
|
|
||||||
for (size_t i = 0; i < n; ++i) {
|
|
||||||
s[i] = char(i % 126 + 1);
|
|
||||||
}
|
|
||||||
std::string res;
|
|
||||||
if (root == rank) {
|
|
||||||
res = s;
|
|
||||||
}
|
|
||||||
rabit::Broadcast(&res, root);
|
|
||||||
|
|
||||||
utils::Check(res == s, "[%d] TestBcast fail", rank);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
if (argc < 3) {
|
|
||||||
printf("Usage: <ndata> <config>\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
int n = atoi(argv[1]);
|
|
||||||
rabit::Init(argc, argv);
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
std::string name = rabit::GetProcessorName();
|
|
||||||
|
|
||||||
int max_rank = rank;
|
|
||||||
rabit::Allreduce<op::Max>(&max_rank, 1);
|
|
||||||
utils::Check(max_rank == nproc - 1, "max rank is world size-1");
|
|
||||||
|
|
||||||
Model model;
|
|
||||||
srand(0);
|
|
||||||
int ntrial = 0;
|
|
||||||
for (int i = 1; i < argc; ++i) {
|
|
||||||
int n;
|
|
||||||
if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n;
|
|
||||||
}
|
|
||||||
int iter = rabit::LoadCheckPoint(&model);
|
|
||||||
if (iter == 0) {
|
|
||||||
model.InitModel(n);
|
|
||||||
}
|
|
||||||
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
|
|
||||||
|
|
||||||
for (int r = iter; r < 3; ++r) {
|
|
||||||
TestMax(&model, r);
|
|
||||||
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
|
|
||||||
int step = std::max(nproc / 3, 1);
|
|
||||||
for (int i = 0; i < nproc; i += step) {
|
|
||||||
TestBcast(n, i);
|
|
||||||
}
|
|
||||||
printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
|
|
||||||
|
|
||||||
TestSum(&model, r);
|
|
||||||
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
|
|
||||||
TestAllgather(&model, r);
|
|
||||||
printf("[%d] !!!TestAllgather pass, iter=%d\n", rank, r);
|
|
||||||
rabit::CheckPoint(&model);
|
|
||||||
printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
|
|
||||||
}
|
|
||||||
rabit::Finalize();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,34 +0,0 @@
|
|||||||
import os
|
|
||||||
import argparse
|
|
||||||
import sys
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description='TODO')
|
|
||||||
parser.add_argument('-ho', '--host_dir', required=True)
|
|
||||||
parser.add_argument('-s', '--submit_script', required=True)
|
|
||||||
parser.add_argument('-rex', '--rabit_exec', required=True)
|
|
||||||
parser.add_argument('-mpi', '--mpi_exec', required=True)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
ndata = [10**4, 10**5, 10**6, 10**7]
|
|
||||||
nrepeat = [10**4, 10**3, 10**2, 10]
|
|
||||||
|
|
||||||
machines = [2,4,8,16,31]
|
|
||||||
|
|
||||||
executables = [args.rabit_exec, args.mpi_exec]
|
|
||||||
|
|
||||||
for executable in executables:
|
|
||||||
sys.stderr.write('Executable %s' % executable)
|
|
||||||
sys.stderr.flush()
|
|
||||||
for i, data in enumerate(ndata):
|
|
||||||
for machine in machines:
|
|
||||||
host_file = os.path.join(args.host_dir, 'hosts%d' % machine)
|
|
||||||
cmd = 'python %s %d %s %s %d %d' % (args.submit_script, machine, host_file, executable, data, nrepeat[i])
|
|
||||||
sys.stderr.write('data=%d, repeat=%d, machine=%d\n' % (data, nrepeat[i], machine))
|
|
||||||
sys.stderr.flush()
|
|
||||||
os.system(cmd)
|
|
||||||
sys.stderr.write('\n')
|
|
||||||
sys.stderr.flush()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@ -1,99 +0,0 @@
|
|||||||
// This program is used to test the speed of rabit API
|
|
||||||
#include <rabit/rabit.h>
|
|
||||||
#include <rabit/internal/timer.h>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cmath>
|
|
||||||
#include <time.h>
|
|
||||||
|
|
||||||
using namespace rabit;
|
|
||||||
|
|
||||||
double max_tdiff, sum_tdiff, bcast_tdiff, tot_tdiff;
|
|
||||||
|
|
||||||
inline void TestMax(size_t n) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
std::vector<float> ndata(n);
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % 111;
|
|
||||||
}
|
|
||||||
double tstart = utils::GetTime();
|
|
||||||
rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
|
|
||||||
max_tdiff += utils::GetTime() - tstart;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestSum(size_t n) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
const int z = 131;
|
|
||||||
std::vector<float> ndata(n);
|
|
||||||
for (size_t i = 0; i < ndata.size(); ++i) {
|
|
||||||
ndata[i] = (i * (rank+1)) % z;
|
|
||||||
}
|
|
||||||
double tstart = utils::GetTime();
|
|
||||||
rabit::Allreduce<op::Sum>(&ndata[0], ndata.size());
|
|
||||||
sum_tdiff += utils::GetTime() - tstart;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void TestBcast(size_t n, int root) {
|
|
||||||
int rank = rabit::GetRank();
|
|
||||||
std::string s; s.resize(n);
|
|
||||||
for (size_t i = 0; i < n; ++i) {
|
|
||||||
s[i] = char(i % 126 + 1);
|
|
||||||
}
|
|
||||||
std::string res;
|
|
||||||
res.resize(n);
|
|
||||||
if (root == rank) {
|
|
||||||
res = s;
|
|
||||||
}
|
|
||||||
double tstart = utils::GetTime();
|
|
||||||
rabit::Broadcast(&res[0], res.length(), root);
|
|
||||||
bcast_tdiff += utils::GetTime() - tstart;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void PrintStats(const char *name, double tdiff, int n, int nrep, size_t size) {
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
double tsum = tdiff;
|
|
||||||
rabit::Allreduce<op::Sum>(&tsum, 1);
|
|
||||||
double tavg = tsum / nproc;
|
|
||||||
double tsqr = tdiff - tavg;
|
|
||||||
tsqr *= tsqr;
|
|
||||||
rabit::Allreduce<op::Sum>(&tsqr, 1);
|
|
||||||
double tstd = sqrt(tsqr / nproc);
|
|
||||||
if (rabit::GetRank() == 0) {
|
|
||||||
rabit::TrackerPrintf("%s: mean=%g, std=%g sec\n", name, tavg, tstd);
|
|
||||||
double ndata = n;
|
|
||||||
ndata *= nrep * size;
|
|
||||||
if (n != 0) {
|
|
||||||
rabit::TrackerPrintf("%s-speed: %g MB/sec\n", name, (ndata / tavg) / 1024 / 1024 );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
if (argc < 3) {
|
|
||||||
printf("Usage: <ndata> <nrepeat>\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
srand(0);
|
|
||||||
int n = atoi(argv[1]);
|
|
||||||
int nrep = atoi(argv[2]);
|
|
||||||
utils::Check(nrep >= 1, "need to at least repeat running once");
|
|
||||||
rabit::Init(argc, argv);
|
|
||||||
//int rank = rabit::GetRank();
|
|
||||||
int nproc = rabit::GetWorldSize();
|
|
||||||
std::string name = rabit::GetProcessorName();
|
|
||||||
max_tdiff = sum_tdiff = bcast_tdiff = 0;
|
|
||||||
double tstart = utils::GetTime();
|
|
||||||
for (int i = 0; i < nrep; ++i) {
|
|
||||||
TestMax(n);
|
|
||||||
TestSum(n);
|
|
||||||
TestBcast(n, rand() % nproc);
|
|
||||||
}
|
|
||||||
tot_tdiff = utils::GetTime() - tstart;
|
|
||||||
// use allreduce to get the sum and std of time
|
|
||||||
PrintStats("max_tdiff", max_tdiff, n, nrep, sizeof(float));
|
|
||||||
PrintStats("sum_tdiff", sum_tdiff, n, nrep, sizeof(float));
|
|
||||||
PrintStats("bcast_tdiff", bcast_tdiff, n, nrep, sizeof(char));
|
|
||||||
PrintStats("tot_tdiff", tot_tdiff, 0, nrep, sizeof(float));
|
|
||||||
rabit::Finalize();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@ -1,37 +0,0 @@
|
|||||||
RABIT_BUILD_DMLC = 0
|
|
||||||
|
|
||||||
ifeq ($(RABIT_BUILD_DMLC),1)
|
|
||||||
DMLC=../dmlc-core
|
|
||||||
else
|
|
||||||
DMLC=../../dmlc-core
|
|
||||||
endif
|
|
||||||
|
|
||||||
# this is a makefile used to show testcases of rabit
|
|
||||||
.PHONY: all
|
|
||||||
|
|
||||||
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
|
|
||||||
|
|
||||||
# this experiment test recovery with actually process exit, use keepalive to keep program alive
|
|
||||||
model_recover_10_10k:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5
|
|
||||||
|
|
||||||
model_recover_10_10k_die_same:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1
|
|
||||||
|
|
||||||
model_recover_10_10k_die_hard:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1
|
|
||||||
|
|
||||||
local_recover_10_10k:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
|
||||||
|
|
||||||
pylocal_recover_10_10k:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
|
|
||||||
|
|
||||||
lazy_recover_10_10k_die_hard:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
|
|
||||||
|
|
||||||
lazy_recover_10_10k_die_same:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
|
|
||||||
|
|
||||||
ringallreduce_10_10k:
|
|
||||||
python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10
|
|
||||||
@ -143,7 +143,7 @@ class ColumnSampler {
|
|||||||
*/
|
*/
|
||||||
ColumnSampler() {
|
ColumnSampler() {
|
||||||
uint32_t seed = common::GlobalRandom()();
|
uint32_t seed = common::GlobalRandom()();
|
||||||
rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
|
rabit::Broadcast(&seed, sizeof(seed), 0);
|
||||||
rng_.seed(seed);
|
rng_.seed(seed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -714,8 +714,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
|||||||
/* sync up number of features after matrix loaded.
|
/* sync up number of features after matrix loaded.
|
||||||
* partitioned data will fail the train/val validation check
|
* partitioned data will fail the train/val validation check
|
||||||
* since partitioned data not knowing the real number of features. */
|
* since partitioned data not knowing the real number of features. */
|
||||||
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
|
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
|
||||||
nullptr, fname.c_str());
|
|
||||||
// backward compatiblity code.
|
// backward compatiblity code.
|
||||||
if (!load_row_split) {
|
if (!load_row_split) {
|
||||||
MetaInfo& info = dmat->Info();
|
MetaInfo& info = dmat->Info();
|
||||||
|
|||||||
@ -546,8 +546,7 @@ class LearnerConfiguration : public Learner {
|
|||||||
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
|
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
|
||||||
}
|
}
|
||||||
|
|
||||||
rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr,
|
rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
|
||||||
"num_feature");
|
|
||||||
if (num_feature > mparam_.num_feature) {
|
if (num_feature > mparam_.num_feature) {
|
||||||
mparam_.num_feature = num_feature;
|
mparam_.num_feature = num_feature;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user