Cleanup RABIT. (#6290)

* Remove recovery and MPI speed tests. * Remove readme. * Remove Python binding. * Add checks in C API.
2020-10-27 08:48:22 +08:00
parent 8e0f5a6fc7
commit b180223d18
40 changed files with 113 additions and 1875 deletions
--- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp
@@ -964,7 +964,7 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_RabitVersionNumber
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_RabitAllreduce
  (JNIEnv *jenv, jclass jcls, jobject jsendrecvbuf, jint jcount, jint jenum_dtype, jint jenum_op) {
  void *ptr_sendrecvbuf = jenv->GetDirectBufferAddress(jsendrecvbuf);
-  RabitAllreduce(ptr_sendrecvbuf, (size_t) jcount, jenum_dtype, jenum_op, NULL, NULL);
+  JVM_CHECK_CALL(RabitAllreduce(ptr_sendrecvbuf, (size_t) jcount, jenum_dtype, jenum_op, NULL, NULL));
  return 0;
 }
--- a/python-package/xgboost/rabit.py
+++ b/python-package/xgboost/rabit.py
@@ -120,18 +120,18 @@ def broadcast(data, root):
        s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
        length.value = len(s)
    # run first broadcast
-    _LIB.RabitBroadcast(ctypes.byref(length),
+    _check_call(_LIB.RabitBroadcast(ctypes.byref(length),
-                        ctypes.sizeof(ctypes.c_ulong), root)
+                                    ctypes.sizeof(ctypes.c_ulong), root))
    if root != rank:
        dptr = (ctypes.c_char * length.value)()
        # run second
-        _LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
+        _check_call(_LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
-                            length.value, root)
+                                        length.value, root))
        data = pickle.loads(dptr.raw)
        del dptr
    else:
-        _LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
+        _check_call(_LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
-                            length.value, root)
+                                        length.value, root))
        del s
    return data
@@ -189,18 +189,18 @@ def allreduce(data, op, prepare_fun=None):
    if buf.dtype not in DTYPE_ENUM__:
        raise Exception('data type %s not supported' % str(buf.dtype))
    if prepare_fun is None:
-        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
+        _check_call(_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
-                            buf.size, DTYPE_ENUM__[buf.dtype],
+                                        buf.size, DTYPE_ENUM__[buf.dtype],
-                            op, None, None)
+                                        op, None, None))
    else:
        func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
        def pfunc(_):
            """prepare function."""
            prepare_fun(data)
-        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
+        _check_call(_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
-                            buf.size, DTYPE_ENUM__[buf.dtype],
+                                        buf.size, DTYPE_ENUM__[buf.dtype],
-                            op, func_ptr(pfunc), None)
+                                        op, func_ptr(pfunc), None))
    return buf
--- a/rabit/.travis.yml
+++ b/rabit/.travis.yml
@@ -1,90 +0,0 @@
 sudo: true
 os:
  - linux
  - osx
 osx_image: xcode10.2
 dist: xenial
 language: cpp
 # Use Build Matrix to do lint and build seperately
 env:
  matrix:
    - TASK=lint LINT_LANG=cpp
    - TASK=lint LINT_LANG=python
    - TASK=doc
    # - TASK=build
    - TASK=mpi-build
    - TASK=cmake-test
 matrix:
  exclude:
    - os: osx
      env: TASK=lint LINT_LANG=cpp
    - os: osx
      env: TASK=lint LINT_LANG=python
    - os: osx
      env: TASK=doc
    - os: osx
      env: TASK=build
 # dependent apt packages
 addons:
  apt:
    sources:
      - llvm-toolchain-trusty-5.0
      - ubuntu-toolchain-r-test
      - george-edison55-precise-backports
    packages:
      - doxygen
      - wget
      - git
      - libcurl4-openssl-dev
      - unzip
      - python-numpy
      - gcc-4.8
      - g++-4.8
      - openssh-client
      - openssh-server
      - python3
      - python3-setuptools
      - python3-pip
      - tree
  homebrew:
    packages:
      - gcc49
      - openssl
      - libgit2
      - python3
    update: true
 before_install:
  - git clone https://github.com/dmlc/dmlc-core
  - export TRAVIS=./scripts/
  - source ${TRAVIS}/travis_setup_env.sh
  - ${TRAVIS}/travis_osx_install.sh
  - source ./scripts/travis_setup.sh
 script: scripts/travis_script.sh
 cache:
  directories:
    - ${HOME}/.cache/usr
    - ${HOME}/.cache/pip
    - mpich
 before_cache:
  - ${TRAVIS}/travis_before_cache.sh
 after_success:
  - tree build
  - bash <(curl -s https://codecov.io/bash) -a '-o src/ src/*.c'
 notifications:
 # Emails are sent to the committer's git-configured email address by default,
  email:
    on_success: change
    on_failure: always
--- a/rabit/README.md
+++ b/rabit/README.md
@@ -1,40 +1 @@
-# Rabit: Reliable Allreduce and Broadcast Interface
+# This directory contains the CPU network module for XGBoost.  The library originates from [RABIT](https://github.com/dmlc/rabit)
 [![Build Status](https://travis-ci.org/dmlc/rabit.svg?branch=master)](https://travis-ci.org/dmlc/rabit)
 [![Documentation Status](https://readthedocs.org/projects/rabit/badge/?version=latest)](http://rabit.readthedocs.org/)
 rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs.
 * [Tutorial](guide)
 * [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
 * You can also directly read the [interface header](include/rabit.h)
 * [XGBoost](https://github.com/dmlc/xgboost)
  - Rabit is one of the backbone library to support distributed XGBoost
 ## Features
 All these features comes from the facts about small rabbit:)
 * Portable: rabit is light weight and runs everywhere
  - Rabit is a library instead of a framework, a program only needs to link the library to run
  - Rabit only replies on a mechanism to start program, which was provided by most framework
  - You can run rabit programs on many platforms, including Yarn(Hadoop), MPI using the same code
 * Scalable and Flexible: rabit runs fast
  * Rabit program use Allreduce to communicate, and do not suffer the cost between iterations of MapReduce abstraction.
  - Programs can call rabit functions in any order, as opposed to frameworks where callbacks are offered and called by the framework, i.e. inversion of control principle.
  - Programs persist over all the iterations, unless they fail and recover.
 * Reliable: rabit dig burrows to avoid disasters
  - Rabit programs can recover the model and results using synchronous function calls.
  - Rabit programs can set rabit_boostrap_cache=1 to support allreduce/broadcast operations before loadcheckpoint
  `
    rabit::Init(); -> rabit::AllReduce(); -> rabit::loadCheckpoint(); -> for () { rabit::AllReduce(); rabit::Checkpoint();} -> rabit::Shutdown();
  `
 ## Use Rabit
 * Type make in the root folder will compile the rabit library in lib folder
 * Add lib to the library path and include to the include path of compiler
 * Languages: You can use rabit in C++ and python
  - It is also possible to port the library to other languages
 ## Contributing
 Rabit is an open-source library, contributions are welcomed, including:
 * The rabit core library.
 * Customized tracker script for new platforms and interface of new languages.
 * Tutorial and examples about the library.
--- a/rabit/cmake/Config.cmake.in
+++ b/rabit/cmake/Config.cmake.in
@@ -1,4 +0,0 @@
@PACKAGE_INIT@
 include("${CMAKE_CURRENT_LIST_DIR}/@TARGETS_EXPORT_NAME@.cmake")
 check_required_components("@PROJECT_NAME@")
--- a/rabit/cmake/googletest-download.cmake
+++ b/rabit/cmake/googletest-download.cmake
@@ -1,20 +0,0 @@
 # code copied from https://crascit.com/2015/07/25/cmake-gtest/
 cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
 project(googletest-download NONE)
 include(ExternalProject)
 ExternalProject_Add(
        googletest
        SOURCE_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-src"
        BINARY_DIR "@GOOGLETEST_DOWNLOAD_ROOT@/googletest-build"
        GIT_REPOSITORY
        https://github.com/google/googletest.git
        GIT_TAG
        release-1.8.0
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
        INSTALL_COMMAND ""
        TEST_COMMAND ""
 )
--- a/rabit/cmake/googletest.cmake
+++ b/rabit/cmake/googletest.cmake
@@ -1,32 +0,0 @@
 # the following code to fetch googletest
 # is inspired by and adapted after https://crascit.com/2015/07/25/cmake-gtest/
 # download and unpack googletest at configure time
 macro(fetch_googletest _download_module_path _download_root)
    set(GOOGLETEST_DOWNLOAD_ROOT ${_download_root})
    configure_file(
            ${_download_module_path}/googletest-download.cmake
            ${_download_root}/CMakeLists.txt
            @ONLY
    )
    unset(GOOGLETEST_DOWNLOAD_ROOT)
    execute_process(
            COMMAND
            "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
            WORKING_DIRECTORY
            ${_download_root}
    )
    execute_process(
            COMMAND
            "${CMAKE_COMMAND}" --build .
            WORKING_DIRECTORY
            ${_download_root}
    )
    # adds the targers: gtest, gtest_main, gmock, gmock_main
    add_subdirectory(
            ${_download_root}/googletest-src
            ${_download_root}/googletest-build
    )
 endmacro()
--- a/rabit/doc/python-requirements.txt
+++ b/rabit/doc/python-requirements.txt
@@ -1,4 +1,3 @@
 numpy
 breathe
 commonmark
--- a/rabit/include/rabit/c_api.h
+++ b/rabit/include/rabit/c_api.h
@@ -41,7 +41,7 @@ RABIT_DLL bool RabitInit(int argc, char *argv[]);
 * call this function after you finished all jobs.
 * \return true if rabit is initialized successfully otherwise false
 */
-RABIT_DLL bool RabitFinalize(void);
+RABIT_DLL int RabitFinalize(void);
 /*!
 * \brief get rank of previous process in ring topology
@@ -91,8 +91,7 @@ RABIT_DLL void RabitGetProcessorName(char *out_name,
 * \param size the size of the data
 * \param root the root of process
 */
-RABIT_DLL void RabitBroadcast(void *sendrecv_data,
+RABIT_DLL int RabitBroadcast(void *sendrecv_data, rbt_ulong size, int root);
                              rbt_ulong size, int root);
 /*!
 * \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
@@ -110,12 +109,9 @@ RABIT_DLL void RabitBroadcast(void *sendrecv_data,
 * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details
 * \sa ReturnType
 */
-RABIT_DLL void RabitAllgather(void *sendrecvbuf,
+RABIT_DLL int RabitAllgather(void *sendrecvbuf, size_t total_size,
-                                  size_t total_size,
+                             size_t beginIndex, size_t size_node_slice,
-                                  size_t beginIndex,
+                             size_t size_prev_slice, int enum_dtype);
                                  size_t size_node_slice,
                                  size_t size_prev_slice,
                                  int enum_dtype);
 /*!
 * \brief perform in-place allreduce, on sendrecvbuf
@@ -133,14 +129,11 @@ RABIT_DLL void RabitAllgather(void *sendrecvbuf,
 * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
 *                    will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
 *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to passed into the lazy preprocessing function
+ * \param prepare_arg argument used to passed into the lazy preprocessing function
-   */
+ */
-RABIT_DLL void RabitAllreduce(void *sendrecvbuf,
+RABIT_DLL int RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
-                              size_t count,
+                             int enum_op, void (*prepare_fun)(void *arg),
-                              int enum_dtype,
+                             void *prepare_arg);
                              int enum_op,
                              void (*prepare_fun)(void *arg),
                              void *prepare_arg);
 /*!
 * \brief load latest check point
--- a/rabit/include/rabit/internal/engine.h
+++ b/rabit/include/rabit/internal/engine.h
@@ -9,16 +9,6 @@
 #include <string>
 #include "rabit/serializable.h"
 #if (defined(__GNUC__) && !defined(__clang__))
 #define _FILE  __builtin_FILE()
 #define _LINE  __builtin_LINE()
 #define _CALLER  __builtin_FUNCTION()
 #else
 #define _FILE  "N/A"
 #define _LINE  -1
 #define _CALLER  "N/A"
 #endif  // (defined(__GNUC__) && !defined(__clang__))
 namespace MPI {  // NOLINT
 /*! \brief MPI data type just to be compatible with MPI reduce function*/
 class Datatype;
@@ -65,18 +55,12 @@ class IEngine {
   * \param slice_begin beginning of the current slice
   * \param slice_end end of the current slice
   * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  virtual void Allgather(void *sendrecvbuf,
                         size_t total_size,
                         size_t slice_begin,
                         size_t slice_end,
-                         size_t size_prev_slice,
+                         size_t size_prev_slice) = 0;
                         const char* _file = _FILE,
                         const int _line = _LINE,
                         const char* _caller = _CALLER) = 0;
  /*!
   * \brief performs in-place Allreduce, on sendrecvbuf
   *        this function is NOT thread-safe
@@ -88,38 +72,20 @@ class IEngine {
   *                     will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
   * \param prepare_arg argument used to pass into the lazy preprocessing function
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  virtual void Allreduce(void *sendrecvbuf_,
                         size_t type_nbytes,
                         size_t count,
                         ReduceFunction reducer,
                         PreprocFunction prepare_fun = nullptr,
-                         void *prepare_arg = nullptr,
+                         void *prepare_arg = nullptr) = 0;
                         const char* _file = _FILE,
                         const int _line = _LINE,
                         const char* _caller = _CALLER) = 0;
  /*!
   * \brief broadcasts data from root to every other node
   * \param sendrecvbuf_ buffer for both sending and receiving data
   * \param size the size of the data to be broadcasted
   * \param root the root worker id to broadcast the data
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
-  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root,
+  virtual void Broadcast(void *sendrecvbuf_, size_t size, int root) = 0;
                         const char* _file = _FILE,
                         const int _line = _LINE,
                         const char* _caller = _CALLER) = 0;
  /*!
   * \brief explicitly re-initialize everything before calling LoadCheckPoint
   *    call this function when IEngine throws an exception,
   *    this function should only be used for test purposes
   */
  virtual void InitAfterException() = 0;
  /*!
   * \brief loads the latest check point
   * \param global_model pointer to the globally shared model/state
@@ -250,18 +216,12 @@ enum DataType {
 * \param slice_begin beginning of the current slice
 * \param slice_end end of the current slice
 * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 */
 void Allgather(void* sendrecvbuf,
                   size_t total_size,
                   size_t slice_begin,
                   size_t slice_end,
-                   size_t size_prev_slice,
+                   size_t size_prev_slice);
                   const char* _file = _FILE,
                   const int _line = _LINE,
                   const char* _caller = _CALLER);
 /*!
 * \brief perform in-place Allreduce, on sendrecvbuf
 *   this is an internal function used by rabit to be able to compile with MPI
@@ -276,9 +236,6 @@ void Allgather(void* sendrecvbuf,
 *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf_.
 *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
 * \param prepare_arg argument used to pass into the lazy preprocessing function.
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 */
 void Allreduce_(void *sendrecvbuf,  // NOLINT
                size_t type_nbytes,
@@ -287,10 +244,7 @@ void Allreduce_(void *sendrecvbuf,  // NOLINT
                mpi::DataType dtype,
                mpi::OpType op,
                IEngine::PreprocFunction prepare_fun = nullptr,
-                void *prepare_arg = nullptr,
+                void *prepare_arg = nullptr);
                const char* _file = _FILE,
                const int _line = _LINE,
                const char* _caller = _CALLER);
 /*!
 * \brief handle for customized reducer, used to handle customized reduce
 *  this class is mainly created for compatiblity issues with MPI's customized reduce
@@ -316,18 +270,13 @@ class ReduceHandle {
   *                     will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
   * \param prepare_arg argument used to pass into the lazy preprocessing function
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  void Allreduce(void *sendrecvbuf,
                 size_t type_nbytes,
                 size_t count,
                 IEngine::PreprocFunction prepare_fun = nullptr,
-                 void *prepare_arg = nullptr,
+                 void *prepare_arg = nullptr);
-                 const char* _file = _FILE,
+
                 const int _line = _LINE,
                 const char* _caller = _CALLER);
  /*! \return the number of bytes occupied by the type */
  static int TypeSize(const MPI::Datatype &dtype);
--- a/rabit/include/rabit/internal/rabit-inl.h
+++ b/rabit/include/rabit/internal/rabit-inl.h
@@ -131,40 +131,28 @@ inline std::string GetProcessorName() {
  return engine::GetEngine()->GetHost();
 }
 // broadcast data to all other nodes from root
-inline void Broadcast(void *sendrecv_data, size_t size, int root,
+inline void Broadcast(void *sendrecv_data, size_t size, int root) {
-                      const char* _file,
+  engine::GetEngine()->Broadcast(sendrecv_data, size, root);
                      const int _line,
                      const char* _caller) {
  engine::GetEngine()->Broadcast(sendrecv_data, size, root,
    _file, _line, _caller);
 }
 template<typename DType>
-inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
+inline void Broadcast(std::vector<DType> *sendrecv_data, int root) {
                      const char* _file,
                      const int _line,
                      const char* _caller) {
  size_t size = sendrecv_data->size();
-  Broadcast(&size, sizeof(size), root, _file, _line, _caller);
+  Broadcast(&size, sizeof(size), root);
  if (sendrecv_data->size() != size) {
    sendrecv_data->resize(size);
  }
  if (size != 0) {
-    Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root,
+    Broadcast(&(*sendrecv_data)[0], size * sizeof(DType), root);
      _file, _line, _caller);
  }
 }
-inline void Broadcast(std::string *sendrecv_data, int root,
+inline void Broadcast(std::string *sendrecv_data, int root) {
                      const char* _file,
                      const int _line,
                      const char* _caller) {
  size_t size = sendrecv_data->length();
-  Broadcast(&size, sizeof(size), root, _file, _line, _caller);
+  Broadcast(&size, sizeof(size), root);
  if (sendrecv_data->length() != size) {
    sendrecv_data->resize(size);
  }
  if (size != 0) {
-    Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root,
+    Broadcast(&(*sendrecv_data)[0], size * sizeof(char), root);
    _file, _line, _caller);
  }
 }
@@ -172,13 +160,9 @@ inline void Broadcast(std::string *sendrecv_data, int root,
 template<typename OP, typename DType>
 inline void Allreduce(DType *sendrecvbuf, size_t count,
                      void (*prepare_fun)(void *arg),
-                      void *prepare_arg,
+                      void *prepare_arg) {
                      const char* _file,
                      const int _line,
                      const char* _caller) {
  engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
-                     engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg,
+                     engine::mpi::GetType<DType>(), OP::kType, prepare_fun, prepare_arg);
                     _file, _line, _caller);
 }
 // C++11 support for lambda prepare function
@@ -188,13 +172,9 @@ inline void InvokeLambda(void *fun) {
 }
 template<typename OP, typename DType>
 inline void Allreduce(DType *sendrecvbuf, size_t count,
-                      std::function<void()> prepare_fun,
+                      std::function<void()> prepare_fun) {
                      const char* _file,
                      const int _line,
                      const char* _caller) {
  engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer<OP, DType>,
-                     engine::mpi::GetType<DType>(), OP::kType, InvokeLambda, &prepare_fun,
+                     engine::mpi::GetType<DType>(), OP::kType, InvokeLambda, &prepare_fun);
                     _file, _line, _caller);
 }
 // Performs inplace Allgather
@@ -203,13 +183,10 @@ inline void Allgather(DType *sendrecvbuf,
                      size_t totalSize,
                      size_t beginIndex,
                      size_t sizeNodeSlice,
-                      size_t sizePrevSlice,
+                      size_t sizePrevSlice) {
                      const char* _file,
                      const int _line,
                      const char* _caller) {
  engine::GetEngine()->Allgather(sendrecvbuf, totalSize * sizeof(DType), beginIndex * sizeof(DType),
                        (beginIndex + sizeNodeSlice) * sizeof(DType),
-                        sizePrevSlice * sizeof(DType), _file, _line, _caller);
+                        sizePrevSlice * sizeof(DType));
 }
 #endif  // C++11
@@ -289,12 +266,9 @@ inline Reducer<DType, freduce>::Reducer() {
 template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
 inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
                                               void (*prepare_fun)(void *arg),
-                                               void *prepare_arg,
+                                               void *prepare_arg) {
                                               const char* _file,
                                               const int _line,
                                               const char* _caller) {
  handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
-    prepare_arg, _file, _line, _caller);
+                    prepare_arg);
 }
 // function to perform reduction for SerializeReducer
 template<typename DType>
@@ -342,10 +316,7 @@ template<typename DType>
 inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
                                               size_t max_nbyte, size_t count,
                                               void (*prepare_fun)(void *arg),
-                                               void *prepare_arg,
+                                               void *prepare_arg) {
                                               const char* _file,
                                               const int _line,
                                               const char* _caller) {
  buffer_.resize(max_nbyte * count);
  // setup closure
  SerializeReduceClosure<DType> c;
@@ -353,34 +324,23 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
  c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_;
  // invoke here
  handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
-                    SerializeReduceClosure<DType>::Invoke, &c,
+                    SerializeReduceClosure<DType>::Invoke, &c);
                    _file, _line, _caller);
  for (size_t i = 0; i < count; ++i) {
    utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte);
    sendrecvobj[i].Load(fs);
  }
 }
 #if DMLC_USE_CXX11
 template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)g
 inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               std::function<void()> prepare_fun,
+                                               std::function<void()> prepare_fun) {
-                                               const char* _file,
+  this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun);
                                               const int _line,
                                               const char* _caller) {
  this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun,
    _file, _line, _caller);
 }
 template<typename DType>
 inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
                                               size_t max_nbytes, size_t count,
-                                               std::function<void()> prepare_fun,
+                                               std::function<void()> prepare_fun) {
-                                               const char* _file,
+  this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun);
                                               const int _line,
                                               const char* _caller) {
  this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun,
    _file, _line, _caller);
 }
 #endif  // DMLC_USE_CXX11
 }  // namespace rabit
 #endif  // RABIT_INTERNAL_RABIT_INL_H_
--- a/rabit/include/rabit/internal/timer.h
+++ b/rabit/include/rabit/internal/timer.h
@@ -1,41 +0,0 @@
 /*!
 * Copyright (c) 2014-2019 by Contributors
 * \file timer.h
 * \brief This file defines the utils for timing
 * \author Tianqi Chen, Nacho, Tianyi
 */
 #ifndef RABIT_INTERNAL_TIMER_H_
 #define RABIT_INTERNAL_TIMER_H_
 #include <ctime>
 #ifdef __MACH__
 #include <mach/clock.h>
 #include <mach/mach.h>
 #endif  // __MACH__
 #include "./utils.h"
 namespace rabit {
 namespace utils {
 /*!
 * \brief return time in seconds, not cross platform, avoid to use this in most places
 */
 inline double GetTime() {
 #ifdef __MACH__
  clock_serv_t cclock;
  mach_timespec_t mts;
  host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
  utils::Check(clock_get_time(cclock, &mts) == 0, "failed to get time");
  mach_port_deallocate(mach_task_self(), cclock);
  return static_cast<double>(mts.tv_sec) + static_cast<double>(mts.tv_nsec) * 1e-9;
 #else
 #if defined(__unix__) || defined(__linux__)
  timespec ts;
  utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time");
  return static_cast<double>(ts.tv_sec) + static_cast<double>(ts.tv_nsec) * 1e-9;
 #else
  return static_cast<double>(time(NULL));
 #endif  // defined(__unix__) || defined(__linux__)
 #endif  // __MACH__
 }
 }  // namespace utils
 }  // namespace rabit
 #endif  // RABIT_INTERNAL_TIMER_H_
--- a/rabit/include/rabit/internal/utils.h
+++ b/rabit/include/rabit/internal/utils.h
@@ -69,35 +69,10 @@ inline bool StringToBool(const char* s) {
  return CompareStringsCaseInsensitive(s, "true") == 0 || atoi(s) != 0;
 }
 /*!
 * \brief handling of Assert error, caused by inappropriate input
 * \param msg error message
 */
 inline void HandleAssertError(const char *msg) {
  LOG(FATAL) << msg;
 }
 /*!
 * \brief handling of Check error, caused by inappropriate input
 * \param msg error message
 */
 inline void HandleCheckError(const char *msg) {
  LOG(FATAL) << msg;
 }
 inline void HandlePrint(const char *msg) {
  printf("%s", msg);
 }
 inline void HandleLogInfo(const char *fmt, ...) {
  std::string msg(kPrintBuffer, '\0');
  va_list args;
  va_start(args, fmt);
  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
  va_end(args);
  fprintf(stdout, "%s", msg.c_str());
  fflush(stdout);
 }
 /*! \brief printf, prints messages to the console */
 inline void Printf(const char *fmt, ...) {
  std::string msg(kPrintBuffer, '\0');
@@ -108,15 +83,6 @@ inline void Printf(const char *fmt, ...) {
  HandlePrint(msg.c_str());
 }
 /*! \brief portable version of snprintf */
 inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  int ret = vsnprintf(buf, size, fmt, args);
  va_end(args);
  return ret;
 }
 /*! \brief assert a condition is true, use this to handle debug information */
 inline void Assert(bool exp, const char *fmt, ...) {
  if (!exp) {
@@ -125,7 +91,7 @@ inline void Assert(bool exp, const char *fmt, ...) {
    va_start(args, fmt);
    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
    va_end(args);
-    HandleAssertError(msg.c_str());
+    LOG(FATAL) << msg;
  }
 }
@@ -137,7 +103,7 @@ inline void Check(bool exp, const char *fmt, ...) {
    va_start(args, fmt);
    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
    va_end(args);
-    HandleCheckError(msg.c_str());
+    LOG(FATAL) << msg;
  }
 }
@@ -149,7 +115,7 @@ inline void Error(const char *fmt, ...) {
    va_start(args, fmt);
    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
    va_end(args);
-    HandleCheckError(msg.c_str());
+    LOG(FATAL) << msg;
  }
 }
 }  // namespace utils
@@ -176,15 +142,6 @@ inline T *BeginPtr(std::vector<T> &vec) {  // NOLINT(*)
    return &vec[0];
  }
 }
 /*! \brief get the beginning address of a vector */
 template<typename T>
 inline const T *BeginPtr(const std::vector<T> &vec) {  // NOLINT(*)
  if (vec.size() == 0) {
    return nullptr;
  } else {
    return &vec[0];
  }
 }
 inline char* BeginPtr(std::string &str) {  // NOLINT(*)
  if (str.length() == 0) return nullptr;
  return &str[0];
--- a/rabit/include/rabit/rabit.h
+++ b/rabit/include/rabit/rabit.h
@@ -12,36 +12,7 @@
 #define RABIT_RABIT_H_  // NOLINT(*)
 #include <string>
 #include <vector>
 // whether or not use c++11 support
 #ifndef DMLC_USE_CXX11
 #if defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
 #define DMLC_USE_CXX11 1
 #else
 #define DMLC_USE_CXX11 (__cplusplus >= 201103L)
 #endif  // defined(__GXX_EXPERIMENTAL_CXX0X__) || defined(_MSC_VER)
 #endif  // DMLC_USE_CXX11
 // keeps rabit api caller signature
 #ifndef RABIT_API_CALLER_SIGNATURE
 #define RABIT_API_CALLER_SIGNATURE
 #if (defined(__GNUC__) && !defined(__clang__))
 #define _FILE  __builtin_FILE()
 #define _LINE  __builtin_LINE()
 #define _CALLER  __builtin_FUNCTION()
 #else
 #define _FILE  "N/A"
 #define _LINE  -1
 #define _CALLER  "N/A"
 #endif  // (defined(__GNUC__) && !defined(__clang__))
 #endif  // RABIT_API_CALLER_SIGNATURE
 // optionally support of lambda functions in C++11, if available
 #if DMLC_USE_CXX11
 #include <functional>
 #endif  // C++11
 // engine definition of rabit, defines internal implementation
 // to use rabit interface, there is no need to read engine.h
 // rabit.h and serializable.h are enough to use the interface
@@ -135,31 +106,19 @@ inline void TrackerPrintf(const char *fmt, ...);
 * \param sendrecv_data the pointer to the send/receive buffer,
 * \param size the data size
 * \param root the process root
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 */
-inline void Broadcast(void *sendrecv_data, size_t size, int root,
+inline void Broadcast(void *sendrecv_data, size_t size, int root);
                      const char* _file = _FILE,
                      const int _line = _LINE,
                      const char* _caller = _CALLER);
 /*!
 * \brief broadcasts an std::vector<DType> to every node from root
 * \param sendrecv_data the pointer to send/receive vector,
 *        for the receiver, the vector does not need to be pre-allocated
 * \param root the process root
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 * \tparam DType the data type stored in the vector, has to be a simple data type
 *               that can be directly transmitted by sending the sizeof(DType)
 */
 template<typename DType>
-inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
+inline void Broadcast(std::vector<DType> *sendrecv_data, int root);
                      const char* _file = _FILE,
                      const int _line = _LINE,
                      const char* _caller = _CALLER);
 /*!
 * \brief broadcasts a std::string to every node from the root
 * \param sendrecv_data the pointer to the send/receive buffer,
@@ -169,10 +128,7 @@ inline void Broadcast(std::vector<DType> *sendrecv_data, int root,
 * \param _caller caller function name used to generate unique cache key
 * \param root the process root
 */
-inline void Broadcast(std::string *sendrecv_data, int root,
+inline void Broadcast(std::string *sendrecv_data, int root);
                      const char* _file = _FILE,
                      const int _line = _LINE,
                      const char* _caller = _CALLER);
 /*!
 * \brief performs in-place Allreduce on sendrecvbuf
 *        this function is NOT thread-safe
@@ -191,19 +147,13 @@ inline void Broadcast(std::string *sendrecv_data, int root,
 *                    will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
 *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
 * \param prepare_arg argument used to pass into the lazy preprocessing function
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 * \tparam OP see namespace op, reduce operator
 * \tparam DType data type
 */
 template<typename OP, typename DType>
 inline void Allreduce(DType *sendrecvbuf, size_t count,
                      void (*prepare_fun)(void *) = nullptr,
-                      void *prepare_arg = nullptr,
+                      void *prepare_arg = nullptr);
                      const char* _file = _FILE,
                      const int _line = _LINE,
                      const char* _caller = _CALLER);
 /*!
 * \brief Allgather function, each node have a segment of data in the ring of sendrecvbuf,
@@ -217,19 +167,13 @@ inline void Allreduce(DType *sendrecvbuf, size_t count,
 * \param slice_begin beginning of the current slice
 * \param slice_end end of the current slice
 * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 */
 template<typename DType>
 inline void Allgather(DType *sendrecvbuf_,
                  size_t total_size,
                  size_t slice_begin,
                  size_t slice_end,
-                  size_t size_prev_slice,
+                  size_t size_prev_slice);
                  const char* _file = _FILE,
                  const int _line = _LINE,
                  const char* _caller = _CALLER);
 // C++11 support for lambda prepare function
 #if DMLC_USE_CXX11
@@ -254,18 +198,12 @@ inline void Allgather(DType *sendrecvbuf_,
 * \param prepare_fun  Lazy lambda preprocessing function, prepare_fun() will be invoked
 *                     by the function before performing Allreduce in order to initialize the data in sendrecvbuf.
 *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
 * \param _file caller file name used to generate unique cache key
 * \param _line caller line number used to generate unique cache key
 * \param _caller caller function name used to generate unique cache key
 * \tparam OP see namespace op, reduce operator
 * \tparam DType data type
 */
 template<typename OP, typename DType>
 inline void Allreduce(DType *sendrecvbuf, size_t count,
-                      std::function<void()> prepare_fun,
+                      std::function<void()> prepare_fun);
                      const char* _file = _FILE,
                      const int _line = _LINE,
                      const char* _caller = _CALLER);
 #endif  // C++11
 /*!
 * \brief loads the latest check point
@@ -361,31 +299,19 @@ class Reducer {
   *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
   * \param prepare_arg argument used to pass into the lazy preprocessing function
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  inline void Allreduce(DType *sendrecvbuf, size_t count,
                        void (*prepare_fun)(void *) = nullptr,
-                        void *prepare_arg = nullptr,
+                        void *prepare_arg = nullptr);
                        const char* _file = _FILE,
                        const int _line = _LINE,
                        const char* _caller = _CALLER);
 #if DMLC_USE_CXX11
  /*!
   * \brief customized in-place all reduce operation, with lambda function as preprocessor
   * \param sendrecvbuf pointer to the array of objects to be reduced
   * \param count number of elements to be reduced
   * \param prepare_fun lambda function executed to prepare the data, if necessary
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        std::function<void()> prepare_fun,
+                        std::function<void()> prepare_fun);
                        const char* _file = _FILE,
                        const int _line = _LINE,
                        const char* _caller = _CALLER);
 #endif  // DMLC_USE_CXX11
 private:
@@ -416,17 +342,11 @@ class SerializeReducer {
   *                     will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
   *                     If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called
   * \param prepare_arg argument used to pass into the lazy preprocessing function
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  inline void Allreduce(DType *sendrecvobj,
                        size_t max_nbyte, size_t count,
                        void (*prepare_fun)(void *) = nullptr,
-                        void *prepare_arg = nullptr,
+                        void *prepare_arg = nullptr);
                        const char* _file = _FILE,
                        const int _line = _LINE,
                        const char* _caller = _CALLER);
 // C++11 support for lambda prepare function
 #if DMLC_USE_CXX11
  /*!
@@ -436,16 +356,10 @@ class SerializeReducer {
   *        this includes budget limit for intermediate and final result
   * \param count number of elements to be reduced
   * \param prepare_fun lambda function executed to prepare the data, if necessary
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  inline void Allreduce(DType *sendrecvobj,
                        size_t max_nbyte, size_t count,
-                        std::function<void()> prepare_fun,
+                        std::function<void()> prepare_fun);
                        const char* _file = _FILE,
                        const int _line = _LINE,
                        const char* _caller = _CALLER);
 #endif  // DMLC_USE_CXX11
 private:
--- a/rabit/lib/README.md
+++ b/rabit/lib/README.md
@@ -1,15 +0,0 @@
 Rabit Library
 =====
 This folder holds the library file generated by the compiler. To generate the library file, type ```make``` in the project root folder. If you want mpi compatible library, type ```make mpi```
 ***List of Files***
 * rabit.a The rabit package library
  - Normally you need to link with this one
 * rabit_mock.a The rabit package library with mock test
  - This library allows additional mock-test
 * rabit_mpi.a The MPI backed library
  - Link against this library makes the program use MPI Allreduce
  - This library is not fault-tolerant
 * rabit_empty.a Dummy package implementation
  - This is an empty library that does not provide anything
  - Only introduced to minimize code dependency for projects that only need single machine code
--- a/rabit/python/rabit.py
+++ b/rabit/python/rabit.py
@@ -1,364 +0,0 @@
 """
 Reliable Allreduce and Broadcast Library.
 Author: Tianqi Chen
 """
 # pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value,
 import pickle
 import ctypes
 import os
 import platform
 import sys
 import warnings
 import numpy as np
 # version information about the doc
 __version__ = '1.0'
 _LIB = None
 def _find_lib_path(dll_name):
    """Find the rabit dynamic library files.
    Returns
    -------
    lib_path: list(string)
       List of all found library path to rabit
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    # make pythonpack hack: copy this directory one level upper for setup.py
    dll_path = [curr_path,
                os.path.join(curr_path, '../lib/'),
                os.path.join(curr_path, './lib/')]
    if os.name == 'nt':
        dll_path = [os.path.join(p, dll_name) for p in dll_path]
    else:
        dll_path = [os.path.join(p, dll_name) for p in dll_path]
    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
    #From github issues, most of installation errors come from machines w/o compilers
    if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
        raise RuntimeError(
            'Cannot find Rabit Libarary in the candicate path, ' +
            'did you install compilers and run build.sh in root path?\n'
            'List of candidates:\n' + ('\n'.join(dll_path)))
    return lib_path
 # load in xgboost library
 def _loadlib(lib='standard', lib_dll=None):
    """Load rabit library."""
    global _LIB
    if _LIB is not None:
        warnings.warn('rabit.int call was ignored because it has'\
                          ' already been initialized', level=2)
        return
    if lib_dll is not None:
        _LIB = lib_dll
        return
    if lib == 'standard':
        dll_name = 'librabit'
    else:
        dll_name = 'librabit_' + lib
    if os.name == 'nt':
        dll_name += '.dll'
    elif platform.system() == 'Darwin':
        dll_name += '.dylib'
    else:
        dll_name += '.so'
    _LIB = ctypes.cdll.LoadLibrary(_find_lib_path(dll_name)[0])
    _LIB.RabitGetRank.restype = ctypes.c_int
    _LIB.RabitGetWorldSize.restype = ctypes.c_int
    _LIB.RabitVersionNumber.restype = ctypes.c_int
 def _unloadlib():
    """Unload rabit library."""
    global _LIB
    del _LIB
    _LIB = None
 # reduction operators
 MAX = 0
 MIN = 1
 SUM = 2
 BITOR = 3
 def init(args=None, lib='standard', lib_dll=None):
    """Intialize the rabit module, call this once before using anything.
    Parameters
    ----------
    args: list of str, optional
        The list of arguments used to initialized the rabit
        usually you need to pass in sys.argv.
        Defaults to sys.argv when it is None.
    lib: {'standard', 'mock', 'mpi'}, optional
        Type of library we want to load
        When cdll is specified
    lib_dll: ctypes.DLL, optional
        The DLL object used as lib.
        When this is presented argument lib will be ignored.
    """
    if args is None:
        args = []
    _loadlib(lib, lib_dll)
    arr = (ctypes.c_char_p * len(args))()
    arr[:] = args
    _LIB.RabitInit(len(args), arr)
 def finalize():
    """Finalize the rabit engine.
    Call this function after you finished all jobs.
    """
    _LIB.RabitFinalize()
    _unloadlib()
 def get_rank():
    """Get rank of current process.
    Returns
    -------
    rank : int
        Rank of current process.
    """
    ret = _LIB.RabitGetRank()
    return ret
 def get_world_size():
    """Get total number workers.
    Returns
    -------
    n : int
        Total number of process.
    """
    ret = _LIB.RabitGetWorldSize()
    return ret
 def tracker_print(msg):
    """Print message to the tracker.
    This function can be used to communicate the information of
    the progress to the tracker
    Parameters
    ----------
    msg : str
        The message to be printed to tracker.
    """
    if not isinstance(msg, str):
        msg = str(msg)
    _LIB.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8'))
 def get_processor_name():
    """Get the processor name.
    Returns
    -------
    name : str
        the name of processor(host)
    """
    mxlen = 256
    length = ctypes.c_ulong()
    buf = ctypes.create_string_buffer(mxlen)
    _LIB.RabitGetProcessorName(buf, ctypes.byref(length), mxlen)
    return buf.value
 def broadcast(data, root):
    """Broadcast object from one node to all other nodes.
    Parameters
    ----------
    data : any type that can be pickled
        Input data, if current rank does not equal root, this can be None
    root : int
        Rank of the node to broadcast data from.
    Returns
    -------
    object : int
        the result of broadcast.
    """
    rank = get_rank()
    length = ctypes.c_ulong()
    if root == rank:
        assert data is not None, 'need to pass in data when broadcasting'
        s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
        length.value = len(s)
    # run first broadcast
    _LIB.RabitBroadcast(ctypes.byref(length),
                        ctypes.sizeof(ctypes.c_ulong), root)
    if root != rank:
        dptr = (ctypes.c_char * length.value)()
        # run second
        _LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p),
                            length.value, root)
        data = pickle.loads(dptr.raw)
        del dptr
    else:
        _LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p),
                            length.value, root)
        del s
    return data
 # enumeration of dtypes
 DTYPE_ENUM__ = {
    np.dtype('int8') : 0,
    np.dtype('uint8') : 1,
    np.dtype('int32') : 2,
    np.dtype('uint32') : 3,
    np.dtype('int64') : 4,
    np.dtype('uint64') : 5,
    np.dtype('float32') : 6,
    np.dtype('float64') : 7
 }
 def allreduce(data, op, prepare_fun=None):
    """Perform allreduce, return the result.
    Parameters
    ----------
    data: numpy array
        Input data.
    op: int
        Reduction operators, can be MIN, MAX, SUM, BITOR
    prepare_fun: function
        Lazy preprocessing function, if it is not None, prepare_fun(data)
        will be called by the function before performing allreduce, to intialize the data
        If the result of Allreduce can be recovered directly,
        then prepare_fun will NOT be called
    Returns
    -------
    result : array_like
        The result of allreduce, have same shape as data
    Notes
    -----
    This function is not thread-safe.
    """
    if not isinstance(data, np.ndarray):
        raise Exception('allreduce only takes in numpy.ndarray')
    buf = data.ravel()
    if buf.base is data.base:
        buf = buf.copy()
    if buf.dtype not in DTYPE_ENUM__:
        raise Exception('data type %s not supported' % str(buf.dtype))
    if prepare_fun is None:
        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
                            buf.size, DTYPE_ENUM__[buf.dtype],
                            op, None, None)
    else:
        func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
        def pfunc(args):
            """prepare function."""
            prepare_fun(data)
        _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
                            buf.size, DTYPE_ENUM__[buf.dtype],
                            op, func_ptr(pfunc), None)
    return buf
 def _load_model(ptr, length):
    """
    Internal function used by the module,
    unpickle a model from a buffer specified by ptr, length
    Arguments:
        ptr: ctypes.POINTER(ctypes._char)
            pointer to the memory region of buffer
        length: int
            the length of buffer
    """
    data = (ctypes.c_char * length).from_address(ctypes.addressof(ptr.contents))
    return pickle.loads(data.raw)
 def load_checkpoint(with_local=False):
    """Load latest check point.
    Parameters
    ----------
    with_local: bool, optional
        whether the checkpoint contains local model
    Returns
    -------
    tuple : tuple
        if with_local: return (version, gobal_model, local_model)
        else return (version, gobal_model)
        if returned version == 0, this means no model has been CheckPointed
        and global_model, local_model returned will be None
    """
    gptr = ctypes.POINTER(ctypes.c_char)()
    global_len = ctypes.c_ulong()
    if with_local:
        lptr = ctypes.POINTER(ctypes.c_char)()
        local_len = ctypes.c_ulong()
        version = _LIB.RabitLoadCheckPoint(
            ctypes.byref(gptr),
            ctypes.byref(global_len),
            ctypes.byref(lptr),
            ctypes.byref(local_len))
        if version == 0:
            return (version, None, None)
        return (version,
                _load_model(gptr, global_len.value),
                _load_model(lptr, local_len.value))
    else:
        version = _LIB.RabitLoadCheckPoint(
            ctypes.byref(gptr),
            ctypes.byref(global_len),
            None, None)
        if version == 0:
            return (version, None)
        return (version,
                _load_model(gptr, global_len.value))
 def checkpoint(global_model, local_model=None):
    """Checkpoint the model.
    This means we finished a stage of execution.
    Every time we call check point, there is a version number which will increase by one.
    Parameters
    ----------
    global_model: anytype that can be pickled
        globally shared model/state when calling this function,
        the caller need to gauranttees that global_model is the same in all nodes
    local_model: anytype that can be pickled
       Local model, that is specific to current node/rank.
       This can be None when no local state is needed.
    Notes
    -----
    local_model requires explicit replication of the model for fault-tolerance.
    This will bring replication cost in checkpoint function.
    while global_model do not need explicit replication.
    It is recommended to use global_model if possible.
    """
    sglobal = pickle.dumps(global_model)
    if local_model is None:
        _LIB.RabitCheckPoint(sglobal, len(sglobal), None, 0)
        del sglobal
    else:
        slocal = pickle.dumps(local_model)
        _LIB.RabitCheckPoint(sglobal, len(sglobal), slocal, len(slocal))
        del slocal
        del sglobal
 def version_number():
    """Returns version number of current stored model.
    This means how many calls to CheckPoint we made so far.
    Returns
    -------
    version : int
        Version number of currently stored model
    """
    ret = _LIB.RabitVersionNumber()
    return ret
--- a/rabit/scripts/mpi_build.sh
+++ b/rabit/scripts/mpi_build.sh
@@ -1,27 +0,0 @@
 #!/usr/bin/env bash
 if [ -f mpich/lib/libmpich.so ]; then
  echo "libmpich.so found -- nothing to build."
 else
  echo "Downloading mpich source."
  wget http://www.mpich.org/static/downloads/3.2/mpich-3.2.tar.gz
  tar xfz mpich-3.2.tar.gz
  rm mpich-3.2.tar.gz*
  echo "configuring and building mpich."
  cd mpich-3.2
  #CC=gcc CXX=g++ CFLAGS=-m64 CXXFLAGS=-m64 FFLAGS=-m64
  ./configure \
          --prefix=`pwd`/../mpich \
          --enable-static=false \
          --enable-alloca=true \
          --disable-long-double \
          --enable-threads=single \
          --enable-fortran=no \
          --enable-fast=all \
          --enable-g=none \
          --enable-timing=none \
          --enable-cxx
  make -j4
  make install
  cd -
 fi
--- a/rabit/scripts/travis_osx_install.sh
+++ b/rabit/scripts/travis_osx_install.sh
@@ -1,7 +0,0 @@
 #!/bin/bash
 set -e
 set -x
 if [ ${TRAVIS_OS_NAME} != "osx" ]; then
    exit 0
 fi
--- a/rabit/scripts/travis_runtest.sh
+++ b/rabit/scripts/travis_runtest.sh
@@ -1,14 +0,0 @@
 #!/bin/bash
 conda activate python3
 conda --version
 python --version
 make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k_die_same  || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 model_recover_10_10k_die_hard || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 local_recover_10_10k || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 lazy_recover_10_10k_die_hard || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 lazy_recover_10_10k_die_same || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 ringallreduce_10_10k || exit -1
 make -f test.mk RABIT_BUILD_DMLC=1 pylocal_recover_10_10k || exit -1
--- a/rabit/scripts/travis_script.sh
+++ b/rabit/scripts/travis_script.sh
@@ -1,36 +0,0 @@
 #!/bin/bash
 # main script of travis
 if [ ${TASK} == "lint" ]; then
    make lint RABIT_BUILD_DMLC=1 || exit -1
 fi
 if [ ${TASK} == "doc" ]; then
    make doc 2>log.txt
    (cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag" |grep warning) && exit -1
 fi
 # we should depreciate Makefile based build
 if [ ${TASK} == "build" ]; then
    make all RABIT_BUILD_DMLC=1 || exit -1
 fi
 if [ ${TASK} == "mpi-build" ]; then
    ./scripts/mpi_build.sh
    cd test
    make mpi RABIT_BUILD_DMLC=1 && make speed_test.mpi RABIT_BUILD_DMLC=1 || exit -1
 fi
 #
 if [ ${TASK} == "cmake-test" ]; then
    mkdir build
    cd build
    cmake -DRABIT_BUILD_TESTS=ON -DRABIT_BUILD_DMLC=ON -DGTEST_ROOT=${HOME}/.local ..
    # known osx gtest 1.8 issue
    cp ${HOME}/.local/lib/*.dylib .
    make -j$(nproc)
    make test
    make install || exit -1
    cd ../test
    ../scripts/travis_runtest.sh || exit -1
    rm -rf ../build
 fi
--- a/rabit/scripts/travis_setup.sh
+++ b/rabit/scripts/travis_setup.sh
@@ -1,50 +0,0 @@
 #!/bin/bash
 echo "Testing on: ${TRAVIS_OS_NAME}, Home directory: ${HOME}"
 # Install Miniconda
 if [ ${TRAVIS_OS_NAME} == "osx" ]; then
  wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 else
  wget -O conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 fi
 bash conda.sh -b -p $HOME/miniconda
 source $HOME/miniconda/bin/activate
 conda config --set always_yes yes --set changeps1 no
 conda update -q conda
 conda info -a
 conda create -n python3 python=3.7
 conda activate python3
 conda --version
 python --version
 # Install Python packages
 conda install -c conda-forge numpy scipy urllib3 websocket-client
 python -m pip install cpplint pylint kubernetes
 # Install googletest under home directory
 GTEST_VERSION=1.8.1
 GTEST_RELEASE=release-${GTEST_VERSION}.tar.gz
 GTEST_TAR_BALL=googletest_${GTEST_RELEASE}
 wget https://github.com/google/googletest/archive/${GTEST_RELEASE} -O ${GTEST_TAR_BALL}
 echo "152b849610d91a9dfa1401293f43230c2e0c33f8 ${GTEST_TAR_BALL}" | sha1sum -c
 tar -xf ${GTEST_TAR_BALL}
 pushd .
 cd googletest-release-${GTEST_VERSION}
 mkdir build
 cd build
 echo "Installing to ${HOME}/.local"
 cmake .. -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=${HOME}/.local
 make -j$(nproc)
 make install
 popd
 if [ ${TRAVIS_OS_NAME} == "linux" ]; then
    sudo apt-get install tree
 fi
 if [ ${TRAVIS_OS_NAME} == "osx" ]; then
    brew install python3
 fi
--- a/rabit/scripts/travis_setup_env.sh
+++ b/rabit/scripts/travis_setup_env.sh
@@ -1,40 +0,0 @@
 # script to be sourced in travis yml
 # setup all enviroment variables
 export CACHE_PREFIX=${HOME}/.cache/usr
 export PATH=${HOME}/.local/bin:${PATH}
 export PATH=${PATH}:${CACHE_PREFIX}/bin
 export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${CACHE_PREFIX}/include
 export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${CACHE_PREFIX}/include
 export LIBRARY_PATH=${LIBRARY_PATH}:${CACHE_PREFIX}/lib
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${CACHE_PREFIX}/lib
 export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${CACHE_PREFIX}/lib
 alias make="make -j4"
 # setup the cache prefix folder
 if [ ! -d ${HOME}/.cache ]; then
    mkdir ${HOME}/.cache
 fi
 if [ ! -d ${CACHE_PREFIX} ]; then
    mkdir ${CACHE_PREFIX}
 fi
 if [ ! -d ${CACHE_PREFIX}/include ]; then
    mkdir ${CACHE_PREFIX}/include
 fi
 if [ ! -d ${CACHE_PREFIX}/lib ]; then
    mkdir ${CACHE_PREFIX}/lib
 fi
 if [ ! -d ${CACHE_PREFIX}/bin ]; then
    mkdir ${CACHE_PREFIX}/bin
 fi
 # setup CUDA path if NVCC_PREFIX exists
 if [ ! -z "$NVCC_PREFIX" ]; then
    export PATH=${PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/bin
    export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include
    export C_INCLUDE_PATH=${C_INCLUDE_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/include
    export LIBRARY_PATH=${LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu
    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${NVCC_PREFIX}/usr/local/cuda-7.5/lib64:${NVCC_PREFIX}/usr/lib/x86_64-linux-gnu
 fi
--- a/rabit/src/allreduce_base.cc
+++ b/rabit/src/allreduce_base.cc
@@ -275,7 +275,7 @@ bool AllreduceBase::ReConnectLinks(const char *cmd) {
  }
  try {
    utils::TCPSocket tracker = this->ConnectTracker();
-    fprintf(stdout, "task %s connected to the tracker\n", task_id.c_str());
+    LOG(INFO) << "task " << task_id << " connected to the tracker";
    tracker.SendStr(std::string(cmd));
    // the rank of previous link, next link in ring
--- a/rabit/src/allreduce_base.h
+++ b/rabit/src/allreduce_base.h
@@ -98,14 +98,9 @@ class AllreduceBase : public IEngine {
  * \param slice_begin beginning of the current slice
  * \param slice_end end of the current slice
  * \param size_prev_slice size of the previous slice i.e. slice of node (rank - 1) % world_size
  * \param _file caller file name used to generate unique cache key
  * \param _line caller line number used to generate unique cache key
  * \param _caller caller function name used to generate unique cache key
  */
  void Allgather(void *sendrecvbuf_, size_t total_size, size_t slice_begin,
-                 size_t slice_end, size_t size_prev_slice,
+                 size_t slice_end, size_t size_prev_slice) override {
                 const char *_file = _FILE, const int _line = _LINE,
                 const char *_caller = _CALLER) override {
    if (world_size == 1 || world_size == -1) {
      return;
    }
@@ -124,15 +119,10 @@ class AllreduceBase : public IEngine {
   *                     will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_.
   *                     If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
   * \param prepare_arg argument used to passed into the lazy preprocessing function
   * \param _file caller file name used to generate unique cache key
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
  void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
                 ReduceFunction reducer, PreprocFunction prepare_fun = nullptr,
-                 void *prepare_arg = nullptr, const char *_file = _FILE,
+                 void *prepare_arg = nullptr) override {
                 const int _line = _LINE,
                 const char *_caller = _CALLER) override {
    if (prepare_fun != nullptr) prepare_fun(prepare_arg);
    if (world_size == 1 || world_size == -1) return;
    utils::Assert(TryAllreduce(sendrecvbuf_, type_nbytes, count, reducer) ==
@@ -148,9 +138,7 @@ class AllreduceBase : public IEngine {
   * \param _line caller line number used to generate unique cache key
   * \param _caller caller function name used to generate unique cache key
   */
-  void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
+  void Broadcast(void *sendrecvbuf_, size_t total_size, int root) override {
                 const char *_file = _FILE, const int _line = _LINE,
                 const char *_caller = _CALLER) override {
    if (world_size == 1 || world_size == -1) return;
    utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
                  "Broadcast failed");
@@ -232,14 +220,6 @@ class AllreduceBase : public IEngine {
  int VersionNumber() const override {
    return version_number;
  }
  /*!
   * \brief explicitly re-init everything before calling LoadCheckPoint
   *    call this function when IEngine throw an exception out,
   *    this function is only used for test purpose
   */
  void InitAfterException() override {
    utils::Error("InitAfterException: not implemented");
  }
  /*!
   * \brief report current status to the job tracker
   * depending on the job tracker we are in
--- a/rabit/src/allreduce_mock.h
+++ b/rabit/src/allreduce_mock.h
@@ -11,8 +11,8 @@
 #include <vector>
 #include <map>
 #include <sstream>
 #include <dmlc/timer.h>
 #include "rabit/internal/engine.h"
 #include "rabit/internal/timer.h"
 #include "allreduce_base.h"
 namespace rabit {
@@ -46,36 +46,30 @@ class AllreduceMock : public AllreduceBase {
  }
  void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count,
                 ReduceFunction reducer, PreprocFunction prepare_fun,
-                 void *prepare_arg, const char *_file = _FILE,
+                 void *prepare_arg) override {
                 const int _line = _LINE,
                 const char *_caller = _CALLER) override {
    this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "AllReduce");
-    double tstart = utils::GetTime();
+    double tstart = dmlc::GetTime();
    AllreduceBase::Allreduce(sendrecvbuf_, type_nbytes, count, reducer,
-                             prepare_fun, prepare_arg, _file, _line, _caller);
+                             prepare_fun, prepare_arg);
-    tsum_allreduce_ += utils::GetTime() - tstart;
+    tsum_allreduce_ += dmlc::GetTime() - tstart;
  }
  void Allgather(void *sendrecvbuf, size_t total_size, size_t slice_begin,
-                 size_t slice_end, size_t size_prev_slice,
+                 size_t slice_end, size_t size_prev_slice) override {
                 const char *_file = _FILE, const int _line = _LINE,
                 const char *_caller = _CALLER) override {
    this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "Allgather");
-    double tstart = utils::GetTime();
+    double tstart = dmlc::GetTime();
    AllreduceBase::Allgather(sendrecvbuf, total_size, slice_begin, slice_end,
-                             size_prev_slice, _file, _line, _caller);
+                             size_prev_slice);
-    tsum_allgather_ += utils::GetTime() - tstart;
+    tsum_allgather_ += dmlc::GetTime() - tstart;
  }
-  void Broadcast(void *sendrecvbuf_, size_t total_size, int root,
+  void Broadcast(void *sendrecvbuf_, size_t total_size, int root) override {
                 const char *_file = _FILE, const int _line = _LINE,
                 const char *_caller = _CALLER) override {
    this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "Broadcast");
-    AllreduceBase::Broadcast(sendrecvbuf_, total_size, root, _file, _line, _caller);
+    AllreduceBase::Broadcast(sendrecvbuf_, total_size, root);
  }
  int LoadCheckPoint(Serializable *global_model,
                     Serializable *local_model) override {
    tsum_allreduce_ = 0.0;
    tsum_allgather_ = 0.0;
-    time_checkpoint_ = utils::GetTime();
+    time_checkpoint_ = dmlc::GetTime();
    if (force_local_ == 0) {
      return AllreduceBase::LoadCheckPoint(global_model, local_model);
    } else {
@@ -87,7 +81,7 @@ class AllreduceMock : public AllreduceBase {
  void CheckPoint(const Serializable *global_model,
                  const Serializable *local_model) override {
    this->Verify(MockKey(rank, version_number, seq_counter, num_trial_), "CheckPoint");
-    double tstart = utils::GetTime();
+    double tstart = dmlc::GetTime();
    double tbet_chkpt = tstart - time_checkpoint_;
    if (force_local_ == 0) {
      AllreduceBase::CheckPoint(global_model, local_model);
@@ -96,8 +90,8 @@ class AllreduceMock : public AllreduceBase {
      ComboSerializer com(global_model, local_model);
      AllreduceBase::CheckPoint(&dum, &com);
    }
-    time_checkpoint_ = utils::GetTime();
+    time_checkpoint_ = dmlc::GetTime();
-    double tcost = utils::GetTime() - tstart;
+    double tcost = dmlc::GetTime() - tstart;
    if (report_stats_ != 0 && rank == 0) {
      std::stringstream ss;
      ss << "[v" << version_number << "] global_size="
--- a/rabit/src/c_api.cc
+++ b/rabit/src/c_api.cc
@@ -228,12 +228,12 @@ RABIT_DLL bool RabitInit(int argc, char *argv[]) {
  return ret;
 }
-RABIT_DLL bool RabitFinalize() {
+RABIT_DLL int RabitFinalize() {
  auto ret = rabit::Finalize();
  if (!ret) {
    XGBAPISetLastError("Failed to shutdown RABIT worker.");
  }
-  return ret;
+  return static_cast<int>(ret);
 }
 RABIT_DLL int RabitGetRingPrevRank() {
@@ -270,37 +270,39 @@ RABIT_DLL void RabitGetProcessorName(char *out_name,
  *out_len = static_cast<rbt_ulong>(s.length());
 }
-RABIT_DLL void RabitBroadcast(void *sendrecv_data,
+RABIT_DLL int RabitBroadcast(void *sendrecv_data,
                              rbt_ulong size, int root) {
  API_BEGIN()
  rabit::Broadcast(sendrecv_data, size, root);
  API_END()
 }
-RABIT_DLL void RabitAllgather(void *sendrecvbuf_, size_t total_size,
+RABIT_DLL int RabitAllgather(void *sendrecvbuf_, size_t total_size,
                              size_t beginIndex, size_t size_node_slice,
                              size_t size_prev_slice, int enum_dtype) {
-  rabit::c_api::Allgather(sendrecvbuf_,
+  API_BEGIN()
-                          total_size,
+  rabit::c_api::Allgather(
-                          beginIndex,
+      sendrecvbuf_, total_size, beginIndex, size_node_slice, size_prev_slice,
-                          size_node_slice,
+      static_cast<rabit::engine::mpi::DataType>(enum_dtype));
-                          size_prev_slice,
+  API_END()
                          static_cast<rabit::engine::mpi::DataType>(enum_dtype));
 }
-RABIT_DLL void RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
+RABIT_DLL int RabitAllreduce(void *sendrecvbuf, size_t count, int enum_dtype,
                              int enum_op, void (*prepare_fun)(void *arg),
                              void *prepare_arg) {
-  rabit::c_api::Allreduce
+  API_BEGIN()
-      (sendrecvbuf, count,
+  rabit::c_api::Allreduce(sendrecvbuf, count,
-       static_cast<rabit::engine::mpi::DataType>(enum_dtype),
+                          static_cast<rabit::engine::mpi::DataType>(enum_dtype),
-       static_cast<rabit::engine::mpi::OpType>(enum_op),
+                          static_cast<rabit::engine::mpi::OpType>(enum_op),
-       prepare_fun, prepare_arg);
+                          prepare_fun, prepare_arg);
  API_END()
 }
 RABIT_DLL int RabitLoadCheckPoint(char **out_global_model,
                                  rbt_ulong *out_global_len,
                                  char **out_local_model,
                                  rbt_ulong *out_local_len) {
-  // NOTE: this function is not thread-safe
+  // no-op as XGBoost 1.3
  using rabit::BeginPtr;
  using namespace rabit::c_api; // NOLINT(*)
  static std::string global_buffer;
--- a/rabit/src/engine.cc
+++ b/rabit/src/engine.cc
@@ -85,12 +85,9 @@ IEngine *GetEngine() {
 void Allgather(void *sendrecvbuf_, size_t total_size,
                   size_t slice_begin,
                   size_t slice_end,
-                   size_t size_prev_slice,
+                   size_t size_prev_slice) {
                   const char* _file,
                   const int _line,
                   const char* _caller) {
  GetEngine()->Allgather(sendrecvbuf_, total_size, slice_begin,
-    slice_end, size_prev_slice, _file, _line, _caller);
+    slice_end, size_prev_slice);
 }
@@ -102,12 +99,9 @@ void Allreduce_(void *sendrecvbuf,  // NOLINT
                mpi::DataType dtype,
                mpi::OpType op,
                IEngine::PreprocFunction prepare_fun,
-                void *prepare_arg,
+                void *prepare_arg) {
                const char* _file,
                const int _line,
                const char* _caller) {
  GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun,
-    prepare_arg, _file, _line, _caller);
+    prepare_arg);
 }
 // code for reduce handle
@@ -126,14 +120,10 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
 void ReduceHandle::Allreduce(void *sendrecvbuf,
                             size_t type_nbytes, size_t count,
                             IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg,
+                             void *prepare_arg) {
                             const char* _file,
                             const int _line,
                             const char* _caller) {
  utils::Assert(redfunc_ != nullptr, "must intialize handle to call AllReduce");
  GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count,
-                         redfunc_, prepare_fun, prepare_arg,
+                         redfunc_, prepare_fun, prepare_arg);
                         _file, _line, _caller);
 }
 }  // namespace engine
 }  // namespace rabit
--- a/rabit/test/.gitignore
+++ b/rabit/test/.gitignore
@@ -1,3 +0,0 @@
 *.mpi
 *_test
 *_recover
--- a/rabit/test/Makefile
+++ b/rabit/test/Makefile
@@ -1,77 +0,0 @@
 RABIT_BUILD_DMLC = 0
 ifeq ($(RABIT_BUILD_DMLC),1)
    DMLC=../dmlc-core
 else
    DMLC=../../dmlc-core
 endif
 MPICXX=../mpich/bin/mpicxx
 export LDFLAGS=  -L../lib -pthread -lm
 export CFLAGS = -Wall -O3 -Wno-unknown-pragmas
 export CC = gcc
 export CXX = g++
 #----------------------------
 # Settings for power and arm arch
 #----------------------------
 ARCH := $(shell uname -a)
 ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
 	CFLAGS += -march=native
 else
 	CFLAGS += -msse2
 endif
 ifndef WITH_FPIC
 	WITH_FPIC = 1
 endif
 ifeq ($(WITH_FPIC), 1)
 	CFLAGS += -fPIC
 endif
 CFLAGS += -I../include -I $(DMLC)/include -std=c++11
 # specify tensor path
 BIN = speed_test model_recover local_recover lazy_recover
 OBJ = $(RABIT_OBJ) speed_test.o model_recover.o local_recover.o lazy_recover.o
 MPIBIN = speed_test.mpi
 .PHONY: clean all lib mpi
 .PHONY: lib all
 all: $(BIN)
 lib:
 	cd ..;make clean;make;cd -
 .PHONY: mpi
 mpi:
 	cd ..;make mpi;cd -
 # programs
 speed_test.o: speed_test.cc ../include/rabit/*.h lib mpi
 model_recover.o: model_recover.cc ../include/rabit/*.h lib
 local_recover.o: local_recover.cc ../include/rabit/*.h lib
 lazy_recover.o: lazy_recover.cc ../include/rabit/*.h lib
 # we can link against MPI version to get use MPI
 speed_test: speed_test.o  $(RABIT_OBJ)
 speed_test.mpi: speed_test.o $(MPIOBJ)
 model_recover: model_recover.o  $(RABIT_OBJ)
 local_recover: local_recover.o  $(RABIT_OBJ)
 lazy_recover: lazy_recover.o  $(RABIT_OBJ)
 $(BIN) :
 	$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc, $^) ../lib/librabit_mock.a $(LDFLAGS)
 $(OBJ) :
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
 $(MPIBIN) :
 	$(MPICXX) $(CFLAGS) -I../mpich/include -shared -o $@ $(filter %.cpp %.o %.c %.cc, $^) \
    ../lib/librabit_mpi.so $(LDFLAGS) -L../mpich/lib -Wl,-rpath,../mpich/lib -lmpi
 clean:
 	$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) *~ ../src/*~
--- a/rabit/test/README.md
+++ b/rabit/test/README.md
@@ -1,18 +0,0 @@
 Testcases of Rabit
 ====
 This folder contains internal testcases to test correctness and efficiency of rabit API
 The example running scripts for testcases are given by test.mk
 * type ```make -f test.mk testcasename``` to run certain testcase
 Helper Scripts
 ====
 * test.mk contains Makefile documentation of all testcases
 * keepalive.sh helper bash to restart a program when it dies abnormally
 List of Programs
 ====
 * speed_test: test the running speed of rabit API
 * test_local_recover: test recovery of local state when error happens
 * test_model_recover: test recovery of global state when error happens
--- a/rabit/test/lazy_recover.cc
+++ b/rabit/test/lazy_recover.cc
@@ -1,125 +0,0 @@
 // this is a test case to test whether rabit can recover model when
 // facing an exception
 #include <rabit/rabit.h>
 #include <cstdio>
 #include <cstdlib>
 #include <cmath>
 using namespace rabit;
 // dummy model
 class Model : public rabit::Serializable {
 public:
  // iterations
  std::vector<float> data;
  // load from stream
  virtual void Load(rabit::Stream *fi) {
    fi->Read(&data);
  }
  /*! \brief save the model to the stream */
  virtual void Save(rabit::Stream *fo) const {
    fo->Write(data);
  }
  virtual void InitModel(size_t n) {
    data.clear();
    data.resize(n, 1.0f);
  }
 };
 inline void TestMax(Model *model, int ntrial, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = iter + 111;
  std::vector<float> ndata(model->data.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % z  + model->data[i];
  }
  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    float rmax = (i * 1) % z + model->data[i];
    for (int r = 0; r < nproc; ++r) {
      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
    }
    utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]);
  }
 }
 inline void TestSum(Model *model, int ntrial, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = 131 + iter;
  std::vector<float> ndata(model->data.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % z + model->data[i];
  }
  Allreduce<op::Sum>(&ndata[0], ndata.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    float rsum = model->data[i] * nproc;
    for (int r = 0; r < nproc; ++r) {
      rsum += (float)((i * (r+1)) % z);
    }
    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
  }
  model->data = ndata;
 }
 inline void TestBcast(size_t n, int root, int ntrial, int iter) {
  int rank = rabit::GetRank();
  std::string s; s.resize(n);
  for (size_t i = 0; i < n; ++i) {
    s[i] = char(i % 126 + 1);
  }
  std::string res;
  if (root == rank) {
    res = s;
    rabit::Broadcast(&res, root);
  } else {
    rabit::Broadcast(&res, root);
  }
  utils::Check(res == s, "[%d] TestBcast fail", rank);
 }
 int main(int argc, char *argv[]) {
  if (argc < 3) {
    printf("Usage: <ndata> <config>\n");
    return 0;
  }
  int n = atoi(argv[1]);
  rabit::Init(argc, argv);
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  std::string name = rabit::GetProcessorName();
  Model model;
  srand(0);
  int ntrial = 0;
  for (int i = 1; i < argc; ++i) {
    int n;
    if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n;
  }
  int iter = rabit::LoadCheckPoint(&model);
  if (iter == 0) {
    model.InitModel(n);
    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
  } else {
    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
  }
  for (int r = iter; r < 3; ++r) {
    TestMax(&model, ntrial, r);
    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);
    int step = std::max(nproc / 3, 1);
    for (int i = 0; i < nproc; i += step) {
      TestBcast(n, i, ntrial, r);
    }
    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
    TestSum(&model, ntrial, r);
    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
    rabit::LazyCheckPoint(&model);
    printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
  }
  rabit::Finalize();
  return 0;
 }
--- a/rabit/test/local_recover.cc
+++ b/rabit/test/local_recover.cc
@@ -1,137 +0,0 @@
 // this is a test case to test whether rabit can recover model when
 // facing an exception
 #include <rabit/rabit.h>
 #include <cstdio>
 #include <cstdlib>
 #include <cmath>
 using namespace rabit;
 // dummy model
 class Model : public rabit::Serializable {
 public:
  // iterations
  std::vector<float> data;
  // load from stream
  virtual void Load(rabit::Stream *fi) {
    fi->Read(&data);
  }
  /*! \brief save the model to the stream */
  virtual void Save(rabit::Stream *fo) const {
    fo->Write(data);
  }
  virtual void InitModel(size_t n, float v) {
    data.clear();
    data.resize(n, v);
  }
 };
 inline void TestMax(Model *model, Model *local, int ntrial, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = iter + 111;
  std::vector<float> ndata(model->data.size());
  rabit::Allreduce<op::Max>(&ndata[0], ndata.size(),
                            [&]() {
                              // use lambda expression to prepare the data
                              for (size_t i = 0; i < ndata.size(); ++i) {
                                ndata[i] = (i * (rank+1)) % z  + local->data[i];
                              }
                            });
  for (size_t i = 0; i < ndata.size(); ++i) {
    float rmax = (i * 1) % z + model->data[i];
    for (int r = 0; r < nproc; ++r) {
      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i] + r);
    }
    utils::Check(rmax == ndata[i], "[%d] TestMax check failure", rank);
  }
  model->data = ndata;
  local->data = ndata;
  for (size_t i = 0; i < ndata.size(); ++i) {
    local->data[i] = ndata[i] + rank;
  }
 }
 inline void TestSum(Model *model, Model *local, int ntrial, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = 131 + iter;
  std::vector<float> ndata(model->data.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % z + local->data[i];
  }
  Allreduce<op::Sum>(&ndata[0], ndata.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    float rsum = 0.0f;
    for (int r = 0; r < nproc; ++r) {
      rsum += (float)((i * (r+1)) % z) + model->data[i] + r;
    }
    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
  }
  model->data = ndata;
  for (size_t i = 0; i < ndata.size(); ++i) {
    local->data[i] = ndata[i] + rank;
  }
 }
 inline void TestBcast(size_t n, int root, int ntrial, int iter) {
  int rank = rabit::GetRank();
  std::string s; s.resize(n);
  for (size_t i = 0; i < n; ++i) {
    s[i] = char(i % 126 + 1);
  }
  std::string res;
  if (root == rank) {
    res = s;
    rabit::Broadcast(&res, root);
  } else {
    rabit::Broadcast(&res, root);
  }
  utils::Check(res == s, "[%d] TestBcast fail", rank);
 }
 int main(int argc, char *argv[]) {
  if (argc < 3) {
    printf("Usage: <ndata>\n");
    return 0;
  }
  int n = atoi(argv[1]);
  rabit::Init(argc, argv);
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  std::string name = rabit::GetProcessorName();
  Model model, local;
  srand(0);
  int ntrial = 0;
  for (int i = 1; i < argc; ++i) {
    int n;
    if (sscanf(argv[i], "repeat=%d", &n) == 1) ntrial = n;
  }
  int iter = rabit::LoadCheckPoint(&model, &local);
  if (iter == 0) {
    model.InitModel(n, 1.0f);
    local.InitModel(n, 1.0f + rank);
    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
  } else {
    printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
  }
  for (int r = iter; r < 3; ++r) {
    TestMax(&model, &local, ntrial, r);
    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);
    int step = std::max(nproc / 3, 1);
    for (int i = 0; i < nproc; i += step) {
      TestBcast(n, i, ntrial, r);
    }
    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
    TestSum(&model, &local, ntrial, r);
    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
    rabit::CheckPoint(&model, &local);
    printf("[%d] !!!CheckPoint pass, iter=%d\n", rank, r);
  }
  rabit::Finalize();
  return 0;
 }
--- a/rabit/test/local_recover.py
+++ b/rabit/test/local_recover.py
@@ -1,32 +0,0 @@
 #!/usr/bin/env python3
 from __future__ import print_function
 from builtins import range
 import sys
 sys.path.append('../python')
 import rabit
 import numpy as np
 rabit.init(lib='mock')
 rank = rabit.get_rank()
 n = 10
 nround = 3
 data = np.ones(n) * rank
 version, model, local = rabit.load_checkpoint(True)
 if version == 0:
    model = np.zeros(n)
    local = np.ones(n)
 else:
    print('[%d] restart from version %d' % (rank, version))
 for i in range(version, nround):
    res = rabit.allreduce(data + model+local, rabit.SUM)
    print('[%d] iter=%d: %s' % (rank, i, str(res)))
    model = res
    local[:] = i
    rabit.checkpoint(model, local)
 rabit.finalize()
--- a/rabit/test/model_recover.cc
+++ b/rabit/test/model_recover.cc
@@ -1,157 +0,0 @@
 // this is a test case to test whether rabit can recover model when
 // facing an exception
 #include <rabit/rabit.h>
 #include <cstdio>
 #include <cstdlib>
 #include <cmath>
 using namespace rabit;
 // dummy model
 class Model : public rabit::Serializable {
 public:
  // iterations
  std::vector<float> data;
  // load from stream
  virtual void Load(rabit::Stream *fi) {
    fi->Read(&data);
  }
  /*! \brief save the model to the stream */
  virtual void Save(rabit::Stream *fo) const {
    fo->Write(data);
  }
  virtual void InitModel(size_t n) {
    data.clear();
    data.resize(n, 1.0f);
  }
 };
 inline void TestMax(Model *model, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = iter + 111;
  std::vector<float> ndata(model->data.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % z  + model->data[i];
  }
  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    float rmax = (i * 1) % z + model->data[i];
    for (int r = 0; r < nproc; ++r) {
      rmax = std::max(rmax, (float)((i * (r+1)) % z) + model->data[i]);
    }
    utils::Check(rmax == ndata[i], "[%d] TestMax check failurem i=%lu, rmax=%f, ndata=%f", rank, i, rmax, ndata[i]);
  }
  model->data = ndata;
 }
 inline void TestSum(Model *model, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = 131 + iter;
  std::vector<float> ndata(model->data.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % z + model->data[i];
  }
  Allreduce<op::Sum>(&ndata[0], ndata.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    float rsum = model->data[i] * nproc;
    for (int r = 0; r < nproc; ++r) {
      rsum += (float)((i * (r+1)) % z);
    }
    utils::Check(fabsf(rsum - ndata[i]) < 1e-5 ,
                 "[%d] TestSum check failure, local=%g, allreduce=%g", rank, rsum, ndata[i]);
  }
  model->data = ndata;
 }
 inline void TestAllgather(Model *model, int iter) {
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  const int z = 131 + iter;
  std::vector<float> ndata(model->data.size() * nproc);
  size_t beginSlice = rank * model->data.size();
  for (size_t i = 0; i < model->data.size(); ++i) {
    ndata[beginSlice + i] = (i * (rank+1)) % z + model->data[i];
  }
  Allgather(&ndata[0], ndata.size(), beginSlice,
  model->data.size(), model->data.size());
  for (size_t i = 0; i < ndata.size(); ++i) {
    int curRank = i / model->data.size();
    int remainder = i % model->data.size();
    float data = (remainder * (curRank+1)) % z + model->data[remainder];
    utils::Check(fabsf(data - ndata[i]) < 1e-5 ,
                 "[%d] TestAllgather check failure, local=%g, allgatherring=%g", rank, data, ndata[i]);
  }
  model->data = ndata;
 }
 inline void TestBcast(size_t n, int root) {
  int rank = rabit::GetRank();
  std::string s; s.resize(n);
  for (size_t i = 0; i < n; ++i) {
    s[i] = char(i % 126 + 1);
  }
  std::string res;
  if (root == rank) {
    res = s;
  }
  rabit::Broadcast(&res, root);
  utils::Check(res == s, "[%d] TestBcast fail", rank);
 }
 int main(int argc, char *argv[]) {
  if (argc < 3) {
    printf("Usage: <ndata> <config>\n");
    return 0;
  }
  int n = atoi(argv[1]);
  rabit::Init(argc, argv);
  int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  std::string name = rabit::GetProcessorName();
  int max_rank = rank;
  rabit::Allreduce<op::Max>(&max_rank, 1);
  utils::Check(max_rank == nproc - 1, "max rank is world size-1");
  Model model;
  srand(0);
  int ntrial = 0;
  for (int i = 1; i < argc; ++i) {
    int n;
    if (sscanf(argv[i], "rabit_num_trial=%d", &n) == 1) ntrial = n;
  }
  int iter = rabit::LoadCheckPoint(&model);
  if (iter == 0) {
    model.InitModel(n);
  }
  printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
  for (int r = iter; r < 3; ++r) {
    TestMax(&model, r);
    printf("[%d] !!!TestMax pass, iter=%d\n",  rank, r);
    int step = std::max(nproc / 3, 1);
    for (int i = 0; i < nproc; i += step) {
      TestBcast(n, i);
    }
    printf("[%d] !!!TestBcast pass, iter=%d\n", rank, r);
    TestSum(&model, r);
    printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
    TestAllgather(&model, r);
    printf("[%d] !!!TestAllgather pass, iter=%d\n", rank, r);
    rabit::CheckPoint(&model);
    printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
  }
  rabit::Finalize();
  return 0;
 }
--- a/rabit/test/speed_runner.py
+++ b/rabit/test/speed_runner.py
@@ -1,34 +0,0 @@
 import os
 import argparse
 import sys
 def main():
  parser = argparse.ArgumentParser(description='TODO')
  parser.add_argument('-ho', '--host_dir', required=True)
  parser.add_argument('-s', '--submit_script', required=True)
  parser.add_argument('-rex', '--rabit_exec', required=True)
  parser.add_argument('-mpi', '--mpi_exec', required=True)
  args = parser.parse_args()
  ndata = [10**4, 10**5, 10**6, 10**7]
  nrepeat = [10**4, 10**3, 10**2, 10]
  machines = [2,4,8,16,31]
  executables = [args.rabit_exec, args.mpi_exec]
  for executable in executables:
    sys.stderr.write('Executable %s' % executable)
    sys.stderr.flush()
    for i, data in enumerate(ndata):
      for machine in machines:
        host_file = os.path.join(args.host_dir, 'hosts%d' % machine)
        cmd = 'python %s %d %s %s %d %d' % (args.submit_script, machine, host_file, executable, data, nrepeat[i])
        sys.stderr.write('data=%d, repeat=%d, machine=%d\n' % (data, nrepeat[i], machine))
        sys.stderr.flush()
        os.system(cmd)
    sys.stderr.write('\n')
    sys.stderr.flush()
 if __name__ == "__main__":
  main()
--- a/rabit/test/speed_test.cc
+++ b/rabit/test/speed_test.cc
@@ -1,99 +0,0 @@
 // This program is used to test the speed of rabit API
 #include <rabit/rabit.h>
 #include <rabit/internal/timer.h>
 #include <cstdio>
 #include <cstdlib>
 #include <cmath>
 #include <time.h>
 using namespace rabit;
 double max_tdiff, sum_tdiff, bcast_tdiff, tot_tdiff;
 inline void TestMax(size_t n) {
  int rank = rabit::GetRank();
  std::vector<float> ndata(n);
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % 111;
  }
  double tstart = utils::GetTime();
  rabit::Allreduce<op::Max>(&ndata[0], ndata.size());
  max_tdiff += utils::GetTime() - tstart;
 }
 inline void TestSum(size_t n) {
  int rank = rabit::GetRank();
  const int z = 131;
  std::vector<float> ndata(n);
  for (size_t i = 0; i < ndata.size(); ++i) {
    ndata[i] = (i * (rank+1)) % z;
  }
  double tstart = utils::GetTime();
  rabit::Allreduce<op::Sum>(&ndata[0], ndata.size());
  sum_tdiff += utils::GetTime() - tstart;
 }
 inline void TestBcast(size_t n, int root) {
  int rank = rabit::GetRank();
  std::string s; s.resize(n);
  for (size_t i = 0; i < n; ++i) {
    s[i] = char(i % 126 + 1);
  }
  std::string res;
  res.resize(n);
  if (root == rank) {
    res = s;
  }
  double tstart = utils::GetTime();
  rabit::Broadcast(&res[0], res.length(), root);
  bcast_tdiff += utils::GetTime() - tstart;
 }
 inline void PrintStats(const char *name, double tdiff, int n, int nrep, size_t size) {
  int nproc = rabit::GetWorldSize();
  double tsum = tdiff;
  rabit::Allreduce<op::Sum>(&tsum, 1);
  double tavg = tsum / nproc;
  double tsqr = tdiff - tavg;
  tsqr *= tsqr;
  rabit::Allreduce<op::Sum>(&tsqr, 1);
  double tstd = sqrt(tsqr / nproc);
  if (rabit::GetRank() == 0) {
    rabit::TrackerPrintf("%s: mean=%g, std=%g sec\n", name, tavg, tstd);
    double ndata = n;
    ndata *= nrep * size;
    if (n != 0) {
      rabit::TrackerPrintf("%s-speed: %g MB/sec\n", name, (ndata / tavg) / 1024 / 1024 );
    }
  }
 }
 int main(int argc, char *argv[]) {
  if (argc < 3) {
    printf("Usage: <ndata> <nrepeat>\n");
    return 0;
  }
  srand(0);
  int n = atoi(argv[1]);
  int nrep = atoi(argv[2]);
  utils::Check(nrep >= 1, "need to at least repeat running once");
  rabit::Init(argc, argv);
  //int rank = rabit::GetRank();
  int nproc = rabit::GetWorldSize();
  std::string name = rabit::GetProcessorName();
  max_tdiff = sum_tdiff = bcast_tdiff = 0;
  double tstart = utils::GetTime();
  for (int i = 0; i < nrep; ++i) {
    TestMax(n);
    TestSum(n);
    TestBcast(n, rand() % nproc);
  }
  tot_tdiff = utils::GetTime() - tstart;
  // use allreduce to get the sum and std of time
  PrintStats("max_tdiff", max_tdiff, n, nrep, sizeof(float));
  PrintStats("sum_tdiff", sum_tdiff, n, nrep, sizeof(float));
  PrintStats("bcast_tdiff", bcast_tdiff, n, nrep, sizeof(char));
  PrintStats("tot_tdiff", tot_tdiff, 0, nrep, sizeof(float));
  rabit::Finalize();
  return 0;
 }
--- a/rabit/test/test.mk
+++ b/rabit/test/test.mk
@@ -1,37 +0,0 @@
 RABIT_BUILD_DMLC = 0
 ifeq ($(RABIT_BUILD_DMLC),1)
    DMLC=../dmlc-core
 else
    DMLC=../../dmlc-core
 endif
 # this is a makefile used to show testcases of rabit
 .PHONY: all
 all: model_recover_10_10k  model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k lazy_recover_10_10k_die_hard lazy_recover_10_10k_die_same ringallreduce_10_10k pylocal_recover_10_10k
 # this experiment test recovery with actually process exit, use keepalive to keep program alive
 model_recover_10_10k:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 rabit_bootstrap_cache=true rabit_debug=true rabit_reduce_ring_mincount=1 rabit_timeout=true rabit_timeout_sec=5
 model_recover_10_10k_die_same:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 rabit_bootstrap_cache=1
 model_recover_10_10k_die_hard:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0 rabit_bootstrap_cache=1
 local_recover_10_10k:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
 pylocal_recover_10_10k:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
 lazy_recover_10_10k_die_hard:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
 lazy_recover_10_10k_die_same:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=20 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
 ringallreduce_10_10k:
 	python $(DMLC)/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 rabit_reduce_ring_mincount=10
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -143,7 +143,7 @@ class ColumnSampler {
  */
  ColumnSampler() {
    uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
+    rabit::Broadcast(&seed, sizeof(seed), 0);
    rng_.seed(seed);
  }
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -714,8 +714,7 @@ DMatrix* DMatrix::Load(const std::string& uri,
  /* sync up number of features after matrix loaded.
   * partitioned data will fail the train/val validation check
   * since partitioned data not knowing the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
    nullptr, fname.c_str());
  // backward compatiblity code.
  if (!load_row_split) {
    MetaInfo& info = dmat->Info();
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -546,8 +546,7 @@ class LearnerConfiguration : public Learner {
        num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
      }
-      rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr,
+      rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
                                       "num_feature");
      if (num_feature > mparam_.num_feature) {
        mparam_.num_feature = num_feature;
      }