From 512f037e55525c6a320237a98a9e628a86c92227 Mon Sep 17 00:00:00 2001
From: Chen Qin
Date: Mon, 16 Sep 2019 20:31:52 -0700
Subject: [PATCH] [rabit_bootstrap_cache] failed xgb worker recovers from other workers (#4808)

* Better recovery support. Restarting only the failed workers.
---
 .travis.yml                        |  2 +-
 CMakeLists.txt                     | 23 ++++++++++++++-------
 Jenkinsfile                        | 32 ++++++++++++++++++++++++++++++
 rabit                              |  2 +-
 src/common/hist_util.cc            |  2 ++
 src/common/random.h                |  2 +-
 src/data/data.cc                   |  3 ++-
 src/learner.cc                     | 24 +++++++++++++++++++---
 tests/ci_build/approx.conf.in      | 12 +++++++++++
 tests/ci_build/build_mock_cmake.sh | 10 ++++++++++
 tests/ci_build/runxgb.sh           | 13 ++++++++++++
 11 files changed, 111 insertions(+), 14 deletions(-)
 create mode 100644 tests/ci_build/approx.conf.in
 create mode 100755 tests/ci_build/build_mock_cmake.sh
 create mode 100755 tests/ci_build/runxgb.sh

diff --git a/.travis.yml b/.travis.yml
index d3925cf03..513bff0b8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required
 
-# Enabling test on Linux and OS X
+# Enabling test on OS X
 os:
   - osx

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92c78917c..b982a96ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
-  set(RABIT_SOURCES
-    rabit/src/allreduce_base.cc
-    rabit/src/allreduce_robust.cc
-    rabit/src/engine.cc
-    rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine.cc
+      rabit/src/c_api.cc)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE $

diff --git a/Jenkinsfile b/Jenkinsfile
index 2ea986372..6eab50fe6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -56,6 +56,7 @@ pipeline {
         script {
           parallel ([
             'build-cpu': { BuildCPU() },
+            'build-cpu-rabit-mock': { BuildCPUMock() },
             'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
             'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
             'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@
             'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
             'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
             'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+            'test-cpp-rabit': { TestCppRabit() },
             'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
             'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
             'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }
 
+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }
 
+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
+
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {

diff --git a/rabit b/rabit
index dba32d54d..9a7ac85d7 160000
--- a/rabit
+++ b/rabit
@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4

diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index a842f554c..678bee018 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -303,6 +303,8 @@ void DenseCuts::Init
   }
   CHECK_EQ(summary_array.size(), in_sketchs->size());
   size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+  // TODO(chenqin): rabit failure recovery assumes no one-time bootstrap call after
+  // LoadCheckPoint; we need to move this Allreduce before the LoadCheckPoint call in the future.
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
 
   p_cuts_->min_vals_.resize(sketchs.size());

diff --git a/src/common/random.h b/src/common/random.h
index d81d76a82..56c6a2b7e 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -127,7 +127,7 @@ class ColumnSampler {
    */
   ColumnSampler() {
     uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0);
+    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
     rng_.seed(seed);
   }

diff --git a/src/data/data.cc b/src/data/data.cc
index 9303c836a..d7d58d58c 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data does not know the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatibility code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();

diff --git a/src/learner.cc b/src/learner.cc
index 7967c081b..6d45c1e62 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
       p_metric->Configure({cfg_.begin(), cfg_.end()});
     }
 
+    // copy dsplit from the config, since configuration will not run again when restoring a failed worker
+    if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+      tparam_.dsplit = DataSplitMode::kRow;
+    }
     this->configured_ = true;
   }
@@ -334,8 +341,15 @@
     }
   }
   {
-    // Write `predictor`, `gpu_id` parameters as extra attributes
-    for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+    std::vector<std::string> saved_params{"predictor", "gpu_id"};
+    // check if rabit_bootstrap_cache was set to non-zero before adding saved configs to the checkpoint
+    if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+        (cfg_.find("rabit_bootstrap_cache"))->second != "0") {
+      std::copy(saved_configs_.begin(), saved_configs_.end(),
+                std::back_inserter(saved_params));
+    }
+    // Write `predictor`, `gpu_id`, and the saved configs as extra attributes
+    for (const auto& key : saved_params) {
       auto it = cfg_.find(key);
       if (it != cfg_.end()) {
         mparam.contain_extra_attrs = 1;
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;
 
   common::Monitor monitor_;
+
+  /*! \brief saved config keys used to restore a failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+      "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };
 
 std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT

diff --git a/tests/ci_build/approx.conf.in b/tests/ci_build/approx.conf.in
new file mode 100644
index 000000000..b2321a79a
--- /dev/null
+++ b/tests/ci_build/approx.conf.in
@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method = approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"

diff --git a/tests/ci_build/build_mock_cmake.sh b/tests/ci_build/build_mock_cmake.sh
new file mode 100755
index 000000000..8cbabd036
--- /dev/null
+++ b/tests/ci_build/build_mock_cmake.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..
diff --git a/tests/ci_build/runxgb.sh b/tests/ci_build/runxgb.sh
new file mode 100755
index 000000000..2ed450dfa
--- /dev/null
+++ b/tests/ci_build/runxgb.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run make in rabit/test to generate librabit_mock,
+# then update config.mk and build xgboost using the mock engine.
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+# Build xgboost with the librabit mock.
+# Define the max worker retry count via dmlc-core's local num attempt.
+# Instrument worker failures with mock=xxxx.
+# Check that hosts recover from the expected iteration.
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
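
To exercise the new scripts end to end outside Jenkins, the CI steps above can be chained by hand. A minimal sketch, assuming an xgboost checkout with the rabit and dmlc-core submodules initialized and commands run from the repository root (the `xgboost` binary lands in the root, which is what the Jenkins stash step relies on):

    # Build xgboost against the rabit mock engine (RABIT_MOCK=ON),
    # then drive the fault-recovery test through the local dmlc tracker.
    git submodule update --init --recursive
    ./tests/ci_build/build_mock_cmake.sh
    ./tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in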
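
Each `mock=...` argument injects one failure. Reading the tuples as rank, checkpoint version (iteration), allreduce sequence number, and death count — the convention of rabit's engine_mock, an assumption here since the patch does not spell it out — `mock=0,10,1,0` kills worker 0 the first time it reaches collective call 1 of round 10, and `--local-num-attempt=10` lets the local tracker restart each killed worker so recovery via `rabit_bootstrap_cache` can be observed. A smaller hand-run sketch with a single injected failure (worker count and tuple values are illustrative only):

    # Two workers; worker 0 is killed once at round 8 and must rebuild its
    # state from the surviving worker via rabit_bootstrap_cache.
    python3 dmlc-core/tracker/dmlc-submit --cluster=local --num-workers=2 \
        --local-num-attempt=3 ./xgboost tests/ci_build/approx.conf.in \
        mock=0,8,1,0 rabit_bootstrap_cache=1 rabit_debug=1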