[rabit_bootstrap_cache] Let a failed xgb worker recover from the other workers (#4808)

* Better recovery support: restart only the failed workers.
Chen Qin 2019-09-16 20:31:52 -07:00 committed by Jiaming Yuan
parent c89bcc4de5
commit 512f037e55
11 changed files with 111 additions and 14 deletions
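In outline: a restarted worker can only rejoin if the one-time collectives that ran before the first checkpoint (seeding RNGs, syncing feature counts) can be replayed to it, so this commit threads a string tag through those calls; the hunks below show the call sites. A sketch of the two tagged call shapes, copied from this diff (reading the tag as a bootstrap-cache key is my interpretation of the commit, not something it states):

```cpp
#include <rabit/rabit.h>

#include <cstdint>

// The two call shapes this commit introduces at its call sites (see the
// src/common/random.h and src/learner.cc hunks below). The trailing string
// names each one-time collective so that, with rabit_bootstrap_cache=1, its
// result can be cached and handed to a restarted worker.
void BootstrapCollectives() {
  uint32_t seed = 0;
  // Broadcast rank 0's RNG seed; "seed" is the cache signature.
  rabit::Broadcast(&seed, sizeof(seed), 0, "seed");

  unsigned num_feature = 0;
  // Max-reduce the global feature count; the two nullptrs are the unused
  // lazy-preparation hook and its argument, "num_feature" the signature.
  rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr,
                                   "num_feature");
}
```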

.travis.yml

@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required
-# Enabling test on Linux and OS X
+# Enabling test on OS X
 os:
   - osx

CMakeLists.txt

@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
-  set(RABIT_SOURCES
-    rabit/src/allreduce_base.cc
-    rabit/src/allreduce_robust.cc
-    rabit/src/engine.cc
-    rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine.cc
+      rabit/src/c_api.cc)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>

Jenkinsfile

@@ -56,6 +56,7 @@ pipeline {
         script {
           parallel ([
             'build-cpu': { BuildCPU() },
+            'build-cpu-rabit-mock': { BuildCPUMock() },
             'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
             'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
             'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@ pipeline {
             'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
             'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
             'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+            'test-cpp-rabit': { TestCppRabit() },
             'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
             'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
             'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }
 
+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }
 
+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
+
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {

rabit

@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4

src/common/hist_util.cc

@@ -303,6 +303,8 @@ void DenseCuts::Init
     }
     CHECK_EQ(summary_array.size(), in_sketchs->size());
     size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+    // TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after LoadCheckPoint;
+    // we need to move this Allreduce before the LoadCheckPoint call in the future.
     sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
     p_cuts_->min_vals_.resize(sketchs.size());
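The TODO encodes an ordering rule of rabit's recovery model: a one-time collective issued after `LoadCheckPoint` cannot be served from the bootstrap cache to a restarted worker. A hedged sketch of the intended call order using the public rabit C++ API (the `Model` type is invented for illustration):

```cpp
#include <rabit/rabit.h>

// Minimal serializable model, for illustration only.
struct Model : public rabit::Serializable {
  float weight = 0.0f;
  void Load(rabit::Stream* fi) override { fi->Read(&weight, sizeof(weight)); }
  void Save(rabit::Stream* fo) const override { fo->Write(&weight, sizeof(weight)); }
};

int main(int argc, char* argv[]) {
  rabit::Init(argc, argv);
  Model model;
  // One-time bootstrap collective: keep it *before* LoadCheckPoint so a
  // restarted worker can recover the result from the bootstrap cache
  // instead of stalling the healthy workers (what the TODO above asks for).
  int num_col = 0;
  rabit::Allreduce<rabit::op::Max>(&num_col, 1);
  int version = rabit::LoadCheckPoint(&model);  // 0 on a fresh start
  for (int iter = version; iter < 10; ++iter) {
    rabit::Allreduce<rabit::op::Sum>(&model.weight, 1);  // per-iteration work
    rabit::CheckPoint(&model);  // bumps the version; survives restarts
  }
  rabit::Finalize();
  return 0;
}
```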

src/common/random.h

@@ -127,7 +127,7 @@ class ColumnSampler {
    */
   ColumnSampler() {
     uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0);
+    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
     rng_.seed(seed);
   }
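One plausible mechanism behind the new `"seed"` argument (hypothetical sketch only; the real logic lives in the updated rabit submodule, which this diff merely pins): the engine keeps a per-signature result cache that a restarted worker consults before re-issuing the collective.

```cpp
#include <cstring>
#include <map>
#include <string>
#include <vector>

// Hypothetical bootstrap cache keyed by the signature string; names and
// structure are illustrative, not rabit's actual implementation.
class BootstrapCache {
 public:
  // Called when a collective completes: remember its result bytes.
  void Store(const std::string& signature, const void* buf, size_t size) {
    auto& slot = cache_[signature];
    slot.assign(static_cast<const char*>(buf),
                static_cast<const char*>(buf) + size);
  }
  // Called by a restarted worker: replay a cached result if present.
  bool TryRestore(const std::string& signature, void* buf, size_t size) const {
    auto it = cache_.find(signature);
    if (it == cache_.end() || it->second.size() != size) return false;
    std::memcpy(buf, it->second.data(), size);
    return true;
  }

 private:
  std::map<std::string, std::vector<char>> cache_;
};
```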

src/data/data.cc

@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data not knowing the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatibility code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();

src/learner.cc

@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
       p_metric->Configure({cfg_.begin(), cfg_.end()});
     }
 
+    // copy dsplit from config since configuration will not run again during restore
+    if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+      tparam_.dsplit = DataSplitMode::kRow;
+    }
     this->configured_ = true;
   }
@@ -334,8 +341,15 @@ class LearnerImpl : public Learner {
       }
     }
     {
-      // Write `predictor`, `gpu_id` parameters as extra attributes
-      for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+      std::vector<std::string> saved_params{"predictor", "gpu_id"};
+      // add the saved configs to the checkpoint only when rabit_bootstrap_cache is non-zero
+      if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+          cfg_.find("rabit_bootstrap_cache")->second != "0") {
+        std::copy(saved_configs_.begin(), saved_configs_.end(),
+                  std::back_inserter(saved_params));
+      }
+      // Write `predictor`, `gpu_id`, and the saved config parameters as extra attributes
+      for (const auto& key : saved_params) {
         auto it = cfg_.find(key);
         if (it != cfg_.end()) {
           mparam.contain_extra_attrs = 1;
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;
   common::Monitor monitor_;
 
+  /*! \brief saved config keys used to restore a failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+      "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };
 
 std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT
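The `saved_configs_` whitelist pairs with the LoadModel hunk above: at save time the whitelisted keys travel as extra attributes in the checkpoint, and a restarted worker copies them back into `cfg_` before configuring itself. A standalone sketch of that round trip (the `SAVED_PARAM_` prefix is an assumption for illustration; this diff does not show it):

```cpp
#include <map>
#include <set>
#include <string>

// Whitelisted keys, mirroring saved_configs_ above.
const std::set<std::string> kSavedConfigs = {
    "max_depth", "tree_method", "dsplit", "seed",
    "silent", "num_round", "gamma", "min_child_weight"};

// Save side: copy whitelisted keys from the live config into checkpoint attrs.
void SaveConfigs(const std::map<std::string, std::string>& cfg,
                 std::map<std::string, std::string>* attrs) {
  for (const auto& key : kSavedConfigs) {
    auto it = cfg.find(key);
    if (it != cfg.end()) (*attrs)["SAVED_PARAM_" + key] = it->second;
  }
}

// Load side: a restarted worker copies them back before configuring itself.
void RestoreConfigs(const std::map<std::string, std::string>& attrs,
                    std::map<std::string, std::string>* cfg) {
  const std::string prefix = "SAVED_PARAM_";
  for (const auto& kv : attrs) {
    if (kv.first.rfind(prefix, 0) != 0) continue;  // not a saved param
    std::string key = kv.first.substr(prefix.size());
    if (kSavedConfigs.count(key)) (*cfg)[key] = kv.second;
  }
}
```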

tests/ci_build/approx.conf.in

@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method=approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"

tests/ci_build/build_mock_cmake.sh

@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..

tests/ci_build/runxgb.sh (new executable file)

@@ -0,0 +1,13 @@
+#!/bin/bash
+# run make in rabit/test to generate librabit_mock
+# update config.mk and build xgboost using the mock engine
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+
+# build xgboost with the rabit mock library
+# set the maximum worker retries via dmlc-core's local num attempt
+# instrument worker failures with mock=<rank>,<version>,<seq>,<ndeath>
+# check that hosts recover from the expected iteration
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
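As I read rabit's mock engine, each `mock=` entry tells one worker when to kill itself, and the four comma-separated fields are rank, checkpoint version, sequence number of the collective call within that version, and the death count at which to trigger; these meanings are inferred from rabit's test tooling, not confirmed by this diff. A small parser illustrating that reading:

```cpp
#include <cstdio>
#include <string>

// Hedged reading of a rabit mock entry such as "mock=0,10,1,0": rank 0 is
// forced to fail at checkpoint version 10, collective call #1, on its first
// life (ndeath=0). Field meanings are inferred, not confirmed by this diff.
struct MockSpec {
  int rank, version, seq, ndeath;
};

bool ParseMock(const std::string& arg, MockSpec* out) {
  return std::sscanf(arg.c_str(), "mock=%d,%d,%d,%d",
                     &out->rank, &out->version, &out->seq, &out->ndeath) == 4;
}

int main() {
  MockSpec spec;
  if (ParseMock("mock=0,10,1,0", &spec)) {
    std::printf("rank=%d version=%d seq=%d ndeath=%d\n",
                spec.rank, spec.version, spec.seq, spec.ndeath);
  }
  return 0;
}
```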