[rabit_bootstrap_cache] Let a failed xgb worker recover from the other workers (#4808)

* Better recovery support: restart only the failed workers.
Chen Qin 2019-09-16 20:31:52 -07:00 committed by Jiaming Yuan
parent c89bcc4de5
commit 512f037e55
11 changed files with 111 additions and 14 deletions
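In outline: a restarted worker can only rejoin if the one-time collectives that ran before the first checkpoint (seeding RNGs, syncing feature counts) can be replayed to it, so this commit threads a string tag through those calls; the hunks below show the call sites. A sketch of the two tagged call shapes, copied from this diff (reading the tag as a bootstrap-cache key is my interpretation of the commit, not something it states):

```cpp
#include <rabit/rabit.h>

#include <cstdint>

// The two call shapes this commit introduces at its call sites (see the
// src/common/random.h and src/learner.cc hunks below). The trailing string
// names each one-time collective so that, with rabit_bootstrap_cache=1, its
// result can be cached and handed to a restarted worker.
void BootstrapCollectives() {
  uint32_t seed = 0;
  // Broadcast rank 0's RNG seed; "seed" is the cache signature.
  rabit::Broadcast(&seed, sizeof(seed), 0, "seed");

  unsigned num_feature = 0;
  // Max-reduce the global feature count; the two nullptrs are the unused
  // lazy-preparation hook and its argument, "num_feature" the signature.
  rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr,
                                   "num_feature");
}
```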

.travis.yml

@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required
-# Enabling test on Linux and OS X
+# Enabling test on OS X
 os:
   - osx

CMakeLists.txt

@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
-  set(RABIT_SOURCES
-    rabit/src/allreduce_base.cc
-    rabit/src/allreduce_robust.cc
-    rabit/src/engine.cc
-    rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine.cc
+      rabit/src/c_api.cc)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>

Jenkinsfile

@@ -56,6 +56,7 @@ pipeline {
         script {
           parallel ([
             'build-cpu': { BuildCPU() },
+            'build-cpu-rabit-mock': { BuildCPUMock() },
             'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
             'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
             'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@ pipeline {
             'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
             'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
             'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+            'test-cpp-rabit': { TestCppRabit() },
             'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
             'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
             'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }
 
+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }
 
+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
+
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {

rabit

@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4

src/common/hist_util.cc

@@ -303,6 +303,8 @@ void DenseCuts::Init
     }
     CHECK_EQ(summary_array.size(), in_sketchs->size());
     size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+    // TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after LoadCheckPoint;
+    // we need to move this Allreduce before the LoadCheckPoint call in the future.
     sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
     p_cuts_->min_vals_.resize(sketchs.size());
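The TODO encodes an ordering rule of rabit's recovery model: a one-time collective issued after `LoadCheckPoint` cannot be served from the bootstrap cache to a restarted worker. A hedged sketch of the intended call order using the public rabit C++ API (the `Model` type is invented for illustration):

```cpp
#include <rabit/rabit.h>

// Minimal serializable model, for illustration only.
struct Model : public rabit::Serializable {
  float weight = 0.0f;
  void Load(rabit::Stream* fi) override { fi->Read(&weight, sizeof(weight)); }
  void Save(rabit::Stream* fo) const override { fo->Write(&weight, sizeof(weight)); }
};

int main(int argc, char* argv[]) {
  rabit::Init(argc, argv);
  Model model;
  // One-time bootstrap collective: keep it *before* LoadCheckPoint so a
  // restarted worker can recover the result from the bootstrap cache
  // instead of stalling the healthy workers (what the TODO above asks for).
  int num_col = 0;
  rabit::Allreduce<rabit::op::Max>(&num_col, 1);
  int version = rabit::LoadCheckPoint(&model);  // 0 on a fresh start
  for (int iter = version; iter < 10; ++iter) {
    rabit::Allreduce<rabit::op::Sum>(&model.weight, 1);  // per-iteration work
    rabit::CheckPoint(&model);  // bumps the version; survives restarts
  }
  rabit::Finalize();
  return 0;
}
```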

src/common/random.h

@@ -127,7 +127,7 @@ class ColumnSampler {
    */
   ColumnSampler() {
     uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0);
+    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
     rng_.seed(seed);
   }
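One plausible mechanism behind the new `"seed"` argument (hypothetical sketch only; the real logic lives in the updated rabit submodule, which this diff merely pins): the engine keeps a per-signature result cache that a restarted worker consults before re-issuing the collective.

```cpp
#include <cstring>
#include <map>
#include <string>
#include <vector>

// Hypothetical bootstrap cache keyed by the signature string; names and
// structure are illustrative, not rabit's actual implementation.
class BootstrapCache {
 public:
  // Called when a collective completes: remember its result bytes.
  void Store(const std::string& signature, const void* buf, size_t size) {
    auto& slot = cache_[signature];
    slot.assign(static_cast<const char*>(buf),
                static_cast<const char*>(buf) + size);
  }
  // Called by a restarted worker: replay a cached result if present.
  bool TryRestore(const std::string& signature, void* buf, size_t size) const {
    auto it = cache_.find(signature);
    if (it == cache_.end() || it->second.size() != size) return false;
    std::memcpy(buf, it->second.data(), size);
    return true;
  }

 private:
  std::map<std::string, std::vector<char>> cache_;
};
```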

src/data/data.cc

@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data not knowing the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatibility code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();

src/learner.cc

@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
       p_metric->Configure({cfg_.begin(), cfg_.end()});
     }
 
+    // copy dsplit from config since configuration will not run again during restore
+    if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+      tparam_.dsplit = DataSplitMode::kRow;
+    }
     this->configured_ = true;
   }
@@ -334,8 +341,15 @@ class LearnerImpl : public Learner {
       }
     }
     {
-      // Write `predictor`, `gpu_id` parameters as extra attributes
-      for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+      std::vector<std::string> saved_params{"predictor", "gpu_id"};
+      // add the saved configs to the checkpoint only when rabit_bootstrap_cache is non-zero
+      if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+          cfg_.find("rabit_bootstrap_cache")->second != "0") {
+        std::copy(saved_configs_.begin(), saved_configs_.end(),
+                  std::back_inserter(saved_params));
+      }
+      // Write `predictor`, `gpu_id`, and the saved config parameters as extra attributes
+      for (const auto& key : saved_params) {
         auto it = cfg_.find(key);
         if (it != cfg_.end()) {
           mparam.contain_extra_attrs = 1;
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;
   common::Monitor monitor_;
 
+  /*! \brief saved config keys used to restore a failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+      "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };
 
 std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT
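The `saved_configs_` whitelist pairs with the LoadModel hunk above: at save time the whitelisted keys travel as extra attributes in the checkpoint, and a restarted worker copies them back into `cfg_` before configuring itself. A standalone sketch of that round trip (the `SAVED_PARAM_` prefix is an assumption for illustration; this diff does not show it):

```cpp
#include <map>
#include <set>
#include <string>

// Whitelisted keys, mirroring saved_configs_ above.
const std::set<std::string> kSavedConfigs = {
    "max_depth", "tree_method", "dsplit", "seed",
    "silent", "num_round", "gamma", "min_child_weight"};

// Save side: copy whitelisted keys from the live config into checkpoint attrs.
void SaveConfigs(const std::map<std::string, std::string>& cfg,
                 std::map<std::string, std::string>* attrs) {
  for (const auto& key : kSavedConfigs) {
    auto it = cfg.find(key);
    if (it != cfg.end()) (*attrs)["SAVED_PARAM_" + key] = it->second;
  }
}

// Load side: a restarted worker copies them back before configuring itself.
void RestoreConfigs(const std::map<std::string, std::string>& attrs,
                    std::map<std::string, std::string>* cfg) {
  const std::string prefix = "SAVED_PARAM_";
  for (const auto& kv : attrs) {
    if (kv.first.rfind(prefix, 0) != 0) continue;  // not a saved param
    std::string key = kv.first.substr(prefix.size());
    if (kSavedConfigs.count(key)) (*cfg)[key] = kv.second;
  }
}
```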

tests/ci_build/approx.conf.in

@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method=approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"

tests/ci_build/build_mock_cmake.sh

@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..

tests/ci_build/runxgb.sh (new executable file)

@@ -0,0 +1,13 @@
+#!/bin/bash
+# run make in rabit/test to generate librabit_mock
+# update config.mk and build xgboost using the mock engine
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+
+# build xgboost with the rabit mock library
+# set the maximum worker retries via dmlc-core's local num attempt
+# instrument worker failures with mock=<rank>,<version>,<seq>,<ndeath>
+# check that hosts recover from the expected iteration
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
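As I read rabit's mock engine, each `mock=` entry tells one worker when to kill itself, and the four comma-separated fields are rank, checkpoint version, sequence number of the collective call within that version, and the death count at which to trigger; these meanings are inferred from rabit's test tooling, not confirmed by this diff. A small parser illustrating that reading:

```cpp
#include <cstdio>
#include <string>

// Hedged reading of a rabit mock entry such as "mock=0,10,1,0": rank 0 is
// forced to fail at checkpoint version 10, collective call #1, on its first
// life (ndeath=0). Field meanings are inferred, not confirmed by this diff.
struct MockSpec {
  int rank, version, seq, ndeath;
};

bool ParseMock(const std::string& arg, MockSpec* out) {
  return std::sscanf(arg.c_str(), "mock=%d,%d,%d,%d",
                     &out->rank, &out->version, &out->seq, &out->ndeath) == 4;
}

int main() {
  MockSpec spec;
  if (ParseMock("mock=0,10,1,0", &spec)) {
    std::printf("rank=%d version=%d seq=%d ndeath=%d\n",
                spec.rank, spec.version, spec.seq, spec.ndeath);
  }
  return 0;
}
```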