[rabit_bootstrap_cache] failed xgb worker recovers from other workers (#4808)
* Better recovery support: restart only the failed workers.
parent c89bcc4de5
commit 512f037e55
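
Note on context: the change builds on rabit's checkpoint/recovery contract, in which a restarted worker reloads the last globally replicated model version from its peers and replays only the remaining rounds, so only the failed workers restart. A minimal sketch of that contract follows (illustrative, not part of this commit; `Model` and the round count of 10 are stand-ins):

    #include <vector>
    #include <rabit/rabit.h>

    // Hypothetical model state; any rabit::Serializable works the same way.
    struct Model : public rabit::Serializable {
      std::vector<float> weights;
      void Load(dmlc::Stream* fi) override { fi->Read(&weights); }
      void Save(dmlc::Stream* fo) const override { fo->Write(weights); }
    };

    int main(int argc, char* argv[]) {
      rabit::Init(argc, argv);
      Model model;
      model.weights.resize(10, 0.0f);
      // A fresh worker gets version 0; a restarted worker gets the version
      // of its last checkpoint back from surviving peers and resumes there.
      int version = rabit::LoadCheckPoint(&model);
      for (int round = version; round < 10; ++round) {
        // one allreduce-based boosting round over the shared model state
        rabit::Allreduce<rabit::op::Sum>(model.weights.data(), model.weights.size());
        rabit::CheckPoint(&model);  // peers keep a replica used for recovery
      }
      rabit::Finalize();
      return 0;
    }

The catch this PR addresses is the work done *before* the first LoadCheckPoint: bootstrap-time collectives (seed broadcast, feature-count sync, sketch allreduce) previously had no replica to recover from.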
.travis.yml
@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required

-# Enabling test on Linux and OS X
+# Enabling test OS X
 os:
   - osx
CMakeLists.txt
@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)

 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
     set(RABIT_SOURCES
       rabit/src/allreduce_base.cc
       rabit/src/allreduce_robust.cc
       rabit/src/engine.cc
       rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>
Jenkinsfile (vendored, 32 lines changed)
@@ -56,6 +56,7 @@ pipeline {
     script {
       parallel ([
         'build-cpu': { BuildCPU() },
+        'build-cpu-rabit-mock': { BuildCPUMock() },
         'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
         'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
         'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@ pipeline {
         'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
         'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
         'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+        'test-cpp-rabit': { TestCppRabit() },
         'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
         'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
         'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }

+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }

+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
+
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {
rabit (submodule, 2 lines changed)
@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4
src/common/hist_util.cc
@@ -303,6 +303,8 @@ void DenseCuts::Init
   }
   CHECK_EQ(summary_array.size(), in_sketchs->size());
   size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+  // TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after LoadCheckPoint;
+  // we need to move this allreduce before the LoadCheckPoint call in the future.
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
   p_cuts_->min_vals_.resize(sketchs.size());
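
The TODO above is the crux of the change: a one-time collective executed at bootstrap cannot be replayed by a restarted worker unless its result is cached under a stable signature, which is what the extra trailing argument added to the rabit calls throughout this diff provides. A minimal sketch, assuming the extended signatures used in this diff (the tag `"num_col"` is illustrative):

    #include <cstdint>
    #include <rabit/rabit.h>

    int main(int argc, char* argv[]) {
      rabit::Init(argc, argv);
      uint64_t num_col = 42;  // locally observed column count (illustrative)
      // Tagged bootstrap-time collective: with rabit_bootstrap_cache=1 a
      // restarted worker can fetch the cached result keyed by "num_col"
      // from surviving peers instead of forcing a global re-execution.
      rabit::Allreduce<rabit::op::Max>(&num_col, 1, nullptr, nullptr, "num_col");
      rabit::Finalize();
      return 0;
    }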
src/common/random.h
@@ -127,7 +127,7 @@ class ColumnSampler {
   */
  ColumnSampler() {
    uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0);
+    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
    rng_.seed(seed);
  }
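
The `"seed"` tag follows the same pattern: all workers must seed the ColumnSampler RNG identically, so a restarted worker presumably receives the cached broadcast value the surviving workers already used, keeping column sampling in sync.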
src/data/data.cc
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* Sync up the number of features after the matrix is loaded:
    * row-partitioned data would otherwise fail the train/val validation
    * check, since a partition does not know the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatibility code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();
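
A worked example of the sync: if three row-partitions load 90, 100, and 97 columns, the max-allreduce leaves `num_col_ = 100` on every worker. Passing `fname.c_str()` as the signature appears intended to keep calls for different data files distinct in the bootstrap cache.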
src/learner.cc
@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
     p_metric->Configure({cfg_.begin(), cfg_.end()});
   }

+  // Copy dsplit from the config since configuration will not run again during restore.
+  if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+    tparam_.dsplit = DataSplitMode::kRow;
+  }
   this->configured_ = true;
 }
@@ -334,8 +341,15 @@ class LearnerImpl : public Learner {
     }
   }
   {
-    // Write `predictor`, `gpu_id` parameters as extra attributes
-    for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+    std::vector<std::string> saved_params{"predictor", "gpu_id"};
+    // check whether rabit_bootstrap_cache was set to non-zero before adding to the checkpoint
+    if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+        (cfg_.find("rabit_bootstrap_cache"))->second != "0") {
+      std::copy(saved_configs_.begin(), saved_configs_.end(),
+                std::back_inserter(saved_params));
+    }
+    // Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
+    for (const auto& key : saved_params) {
       auto it = cfg_.find(key);
       if (it != cfg_.end()) {
         mparam.contain_extra_attrs = 1;
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;

   common::Monitor monitor_;
+
+  /*! \brief saved config keys used to restore a failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+      "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };

 std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT
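
To make the checkpoint-side logic concrete, here is a self-contained sketch of the save-path selection above (simplified, not xgboost source): with `rabit_bootstrap_cache` enabled, the keys in `saved_configs_` ride along with `predictor`/`gpu_id` as extra model attributes, which is what lets a restarted worker rebuild its training configuration:

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
      std::map<std::string, std::string> cfg{
          {"max_depth", "5"}, {"predictor", "cpu_predictor"},
          {"rabit_bootstrap_cache", "1"}};
      const std::set<std::string> saved_configs{"max_depth", "tree_method",
          "dsplit", "seed", "silent", "num_round", "gamma", "min_child_weight"};
      std::vector<std::string> saved_params{"predictor", "gpu_id"};
      // Only checkpoint the extra configs when bootstrap caching is active.
      auto it = cfg.find("rabit_bootstrap_cache");
      if (it != cfg.end() && it->second != "0") {
        saved_params.insert(saved_params.end(),
                            saved_configs.begin(), saved_configs.end());
      }
      for (const auto& key : saved_params) {
        auto kv = cfg.find(key);
        if (kv != cfg.end()) {
          // In LearnerImpl these become extra attributes on the saved model.
          std::cout << key << " -> " << kv->second << '\n';
        }
      }
      return 0;
    }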
tests/ci_build/approx.conf.in (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method=approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"
tests/ci_build/build_mock_cmake.sh (new executable file, 10 lines)
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..
tests/ci_build/runxgb.sh (new executable file, 13 lines)
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run make in rabit/test to generate librabit_mock,
+# then update config.mk and build xgboost using the mock.
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+# Build xgboost with the librabit mock.
+# Define the max worker retries with dmlc-core's local num attempt.
+# Instrument worker failures with mock=xxxx and
+# check that each host recovered from the expected iteration.
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
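
Reading the failure plan: each `mock=r,v,s,d` tuple is consumed by rabit's mock engine, which (as I read `engine_mock.cc`) makes the worker of rank `r` abort on its `s`-th collective call in version (round) `v` once it has already died `d` times. So `mock=0,10,1,0` kills rank 0 in round 10 on its first life, and `mock=1,11,1,1` kills rank 1 again after one prior death. `--local-num-attempt=10` gives the local tracker enough restarts to absorb every injected failure, while `rabit_bootstrap_cache=1` lets each restarted worker replay bootstrap-time collectives from its peers' caches.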