From 512f037e55525c6a320237a98a9e628a86c92227 Mon Sep 17 00:00:00 2001
From: Chen Qin
Date: Mon, 16 Sep 2019 20:31:52 -0700
Subject: [PATCH] [rabit_bootstrap_cache] failed xgb worker recovers from other workers (#4808)

* Better recovery support. Restarting only the failed workers.
---
 .travis.yml                        |  2 +-
 CMakeLists.txt                     | 23 ++++++++++++++-------
 Jenkinsfile                        | 32 ++++++++++++++++++++++++++++++
 rabit                              |  2 +-
 src/common/hist_util.cc            |  2 ++
 src/common/random.h                |  2 +-
 src/data/data.cc                   |  3 ++-
 src/learner.cc                     | 24 +++++++++++++++++++---
 tests/ci_build/approx.conf.in      | 12 +++++++++++
 tests/ci_build/build_mock_cmake.sh | 10 ++++++++++
 tests/ci_build/runxgb.sh           | 13 ++++++++++++
 11 files changed, 111 insertions(+), 14 deletions(-)
 create mode 100644 tests/ci_build/approx.conf.in
 create mode 100755 tests/ci_build/build_mock_cmake.sh
 create mode 100755 tests/ci_build/runxgb.sh

diff --git a/.travis.yml b/.travis.yml
index d3925cf03..513bff0b8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required
 
-# Enabling test on Linux and OS X
+# Enabling test on OS X
 os:
   - osx

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92c78917c..b982a96ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
-  set(RABIT_SOURCES
-    rabit/src/allreduce_base.cc
-    rabit/src/allreduce_robust.cc
-    rabit/src/engine.cc
-    rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine.cc
+      rabit/src/c_api.cc)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE $

diff --git a/Jenkinsfile b/Jenkinsfile
index 2ea986372..6eab50fe6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -56,6 +56,7 @@ pipeline {
         script {
           parallel ([
             'build-cpu': { BuildCPU() },
+            'build-cpu-rabit-mock': { BuildCPUMock() },
             'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
             'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
             'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@
             'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
             'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
             'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+            'test-cpp-rabit': { TestCppRabit() },
             'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
             'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
             'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }
 
+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }
 
+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
+
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {

diff --git a/rabit b/rabit
index dba32d54d..9a7ac85d7 160000
--- a/rabit
+++ b/rabit
@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4

diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index a842f554c..678bee018 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -303,6 +303,8 @@ void DenseCuts::Init
   }
   CHECK_EQ(summary_array.size(), in_sketchs->size());
   size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+  // TODO(chenqin): rabit failure recovery assumes no one-time bootstrap call after
+  // LoadCheckPoint; we need to move this Allreduce before the LoadCheckPoint call in the future.
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
 
   p_cuts_->min_vals_.resize(sketchs.size());

diff --git a/src/common/random.h b/src/common/random.h
index d81d76a82..56c6a2b7e 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -127,7 +127,7 @@ class ColumnSampler {
    */
   ColumnSampler() {
     uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0);
+    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
     rng_.seed(seed);
   }

diff --git a/src/data/data.cc b/src/data/data.cc
index 9303c836a..d7d58d58c 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data does not know the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatibility code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();

diff --git a/src/learner.cc b/src/learner.cc
index 7967c081b..6d45c1e62 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
       p_metric->Configure({cfg_.begin(), cfg_.end()});
     }
 
+    // copy dsplit from the config, since configuration will not run again when restoring a failed worker
+    if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+      tparam_.dsplit = DataSplitMode::kRow;
+    }
     this->configured_ = true;
   }
@@ -334,8 +341,15 @@
     }
   }
   {
-    // Write `predictor`, `gpu_id` parameters as extra attributes
-    for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+    std::vector<std::string> saved_params{"predictor", "gpu_id"};
+    // check if rabit_bootstrap_cache was set to non-zero before adding saved configs to the checkpoint
+    if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+        (cfg_.find("rabit_bootstrap_cache"))->second != "0") {
+      std::copy(saved_configs_.begin(), saved_configs_.end(),
+                std::back_inserter(saved_params));
+    }
+    // Write `predictor`, `gpu_id`, and the saved configs as extra attributes
+    for (const auto& key : saved_params) {
       auto it = cfg_.find(key);
       if (it != cfg_.end()) {
         mparam.contain_extra_attrs = 1;
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;
 
   common::Monitor monitor_;
+
+  /*! \brief saved config keys used to restore a failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+      "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };
 
 std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT

diff --git a/tests/ci_build/approx.conf.in b/tests/ci_build/approx.conf.in
new file mode 100644
index 000000000..b2321a79a
--- /dev/null
+++ b/tests/ci_build/approx.conf.in
@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method = approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"

diff --git a/tests/ci_build/build_mock_cmake.sh b/tests/ci_build/build_mock_cmake.sh
new file mode 100755
index 000000000..8cbabd036
--- /dev/null
+++ b/tests/ci_build/build_mock_cmake.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..
diff --git a/tests/ci_build/runxgb.sh b/tests/ci_build/runxgb.sh
new file mode 100755
index 000000000..2ed450dfa
--- /dev/null
+++ b/tests/ci_build/runxgb.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run make in rabit/test to generate librabit_mock,
+# then update config.mk and build xgboost using the mock engine.
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+# Build xgboost with the librabit mock.
+# Define the max worker retry count via dmlc-core's local num attempt.
+# Instrument worker failures with mock=xxxx.
+# Check that hosts recover from the expected iteration.
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
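
To exercise the new scripts end to end outside Jenkins, the CI steps above can be chained by hand. A minimal sketch, assuming an xgboost checkout with the rabit and dmlc-core submodules initialized and commands run from the repository root (the `xgboost` binary lands in the root, which is what the Jenkins stash step relies on):

    # Build xgboost against the rabit mock engine (RABIT_MOCK=ON),
    # then drive the fault-recovery test through the local dmlc tracker.
    git submodule update --init --recursive
    ./tests/ci_build/build_mock_cmake.sh
    ./tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in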
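
Each `mock=...` argument injects one failure. Reading the tuples as rank, checkpoint version (iteration), allreduce sequence number, and death count — the convention of rabit's engine_mock, an assumption here since the patch does not spell it out — `mock=0,10,1,0` kills worker 0 the first time it reaches collective call 1 of round 10, and `--local-num-attempt=10` lets the local tracker restart each killed worker so recovery via `rabit_bootstrap_cache` can be observed. A smaller hand-run sketch with a single injected failure (worker count and tuple values are illustrative only):

    # Two workers; worker 0 is killed once at round 8 and must rebuild its
    # state from the surviving worker via rabit_bootstrap_cache.
    python3 dmlc-core/tracker/dmlc-submit --cluster=local --num-workers=2 \
        --local-num-attempt=3 ./xgboost tests/ci_build/approx.conf.in \
        mock=0,8,1,0 rabit_bootstrap_cache=1 rabit_debug=1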