[rabit_bootstrap_cache] failed xgb worker recovers from other workers (#4808)
* Better recovery support: restart only the failed workers.
parent c89bcc4de5
commit 512f037e55
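
Note on context: the change builds on rabit's checkpoint/recovery contract, in which a restarted worker reloads the last globally replicated model version from its peers and replays only the remaining rounds, so only the failed workers restart. A minimal sketch of that contract follows (illustrative, not part of this commit; `Model` and the round count of 10 are stand-ins):

    #include <vector>
    #include <rabit/rabit.h>

    // Hypothetical model state; any rabit::Serializable works the same way.
    struct Model : public rabit::Serializable {
      std::vector<float> weights;
      void Load(dmlc::Stream* fi) override { fi->Read(&weights); }
      void Save(dmlc::Stream* fo) const override { fo->Write(weights); }
    };

    int main(int argc, char* argv[]) {
      rabit::Init(argc, argv);
      Model model;
      model.weights.resize(10, 0.0f);
      // A fresh worker gets version 0; a restarted worker gets the version
      // of its last checkpoint back from surviving peers and resumes there.
      int version = rabit::LoadCheckPoint(&model);
      for (int round = version; round < 10; ++round) {
        // one allreduce-based boosting round over the shared model state
        rabit::Allreduce<rabit::op::Sum>(model.weights.data(), model.weights.size());
        rabit::CheckPoint(&model);  // peers keep a replica used for recovery
      }
      rabit::Finalize();
      return 0;
    }

The catch this PR addresses is the work done *before* the first LoadCheckPoint: bootstrap-time collectives (seed broadcast, feature-count sync, sketch allreduce) previously had no replica to recover from.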
.travis.yml
@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required

-# Enabling test on Linux and OS X
+# Enabling test OS X
 os:
   - osx
CMakeLists.txt
@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)

 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
     set(RABIT_SOURCES
       rabit/src/allreduce_base.cc
       rabit/src/allreduce_robust.cc
       rabit/src/engine.cc
       rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>
Jenkinsfile (vendored, 32 lines changed)
@@ -56,6 +56,7 @@ pipeline {
     script {
       parallel ([
         'build-cpu': { BuildCPU() },
+        'build-cpu-rabit-mock': { BuildCPUMock() },
         'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
         'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
         'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@ pipeline {
         'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
         'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
         'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+        'test-cpp-rabit': { TestCppRabit() },
         'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
         'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
         'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }

+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }

+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
+
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {
rabit (submodule, 2 lines changed)
@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4
src/common/hist_util.cc
@@ -303,6 +303,8 @@ void DenseCuts::Init
   }
   CHECK_EQ(summary_array.size(), in_sketchs->size());
   size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+  // TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after LoadCheckPoint;
+  // we need to move this allreduce before the LoadCheckPoint call in the future.
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
   p_cuts_->min_vals_.resize(sketchs.size());
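
The TODO above is the crux of the change: a one-time collective executed at bootstrap cannot be replayed by a restarted worker unless its result is cached under a stable signature, which is what the extra trailing argument added to the rabit calls throughout this diff provides. A minimal sketch, assuming the extended signatures used in this diff (the tag `"num_col"` is illustrative):

    #include <cstdint>
    #include <rabit/rabit.h>

    int main(int argc, char* argv[]) {
      rabit::Init(argc, argv);
      uint64_t num_col = 42;  // locally observed column count (illustrative)
      // Tagged bootstrap-time collective: with rabit_bootstrap_cache=1 a
      // restarted worker can fetch the cached result keyed by "num_col"
      // from surviving peers instead of forcing a global re-execution.
      rabit::Allreduce<rabit::op::Max>(&num_col, 1, nullptr, nullptr, "num_col");
      rabit::Finalize();
      return 0;
    }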
src/common/random.h
@@ -127,7 +127,7 @@ class ColumnSampler {
   */
  ColumnSampler() {
    uint32_t seed = common::GlobalRandom()();
-    rabit::Broadcast(&seed, sizeof(seed), 0);
+    rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
    rng_.seed(seed);
  }
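
The `"seed"` tag follows the same pattern: all workers must seed the ColumnSampler RNG identically, so a restarted worker presumably receives the cached broadcast value the surviving workers already used, keeping column sampling in sync.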
src/data/data.cc
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* Sync up the number of features after the matrix is loaded:
    * row-partitioned data would otherwise fail the train/val validation
    * check, since a partition does not know the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatibility code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();
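
A worked example of the sync: if three row-partitions load 90, 100, and 97 columns, the max-allreduce leaves `num_col_ = 100` on every worker. Passing `fname.c_str()` as the signature appears intended to keep calls for different data files distinct in the bootstrap cache.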
src/learner.cc
@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
     p_metric->Configure({cfg_.begin(), cfg_.end()});
   }

+  // Copy dsplit from the config since configuration will not run again during restore.
+  if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+    tparam_.dsplit = DataSplitMode::kRow;
+  }
   this->configured_ = true;
 }
@@ -334,8 +341,15 @@ class LearnerImpl : public Learner {
     }
   }
   {
-    // Write `predictor`, `gpu_id` parameters as extra attributes
-    for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+    std::vector<std::string> saved_params{"predictor", "gpu_id"};
+    // check whether rabit_bootstrap_cache was set to non-zero before adding to the checkpoint
+    if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+        (cfg_.find("rabit_bootstrap_cache"))->second != "0") {
+      std::copy(saved_configs_.begin(), saved_configs_.end(),
+                std::back_inserter(saved_params));
+    }
+    // Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
+    for (const auto& key : saved_params) {
       auto it = cfg_.find(key);
       if (it != cfg_.end()) {
         mparam.contain_extra_attrs = 1;
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;

   common::Monitor monitor_;
+
+  /*! \brief saved config keys used to restore a failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+      "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };

 std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT
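
To make the checkpoint-side logic concrete, here is a self-contained sketch of the save-path selection above (simplified, not xgboost source): with `rabit_bootstrap_cache` enabled, the keys in `saved_configs_` ride along with `predictor`/`gpu_id` as extra model attributes, which is what lets a restarted worker rebuild its training configuration:

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
      std::map<std::string, std::string> cfg{
          {"max_depth", "5"}, {"predictor", "cpu_predictor"},
          {"rabit_bootstrap_cache", "1"}};
      const std::set<std::string> saved_configs{"max_depth", "tree_method",
          "dsplit", "seed", "silent", "num_round", "gamma", "min_child_weight"};
      std::vector<std::string> saved_params{"predictor", "gpu_id"};
      // Only checkpoint the extra configs when bootstrap caching is active.
      auto it = cfg.find("rabit_bootstrap_cache");
      if (it != cfg.end() && it->second != "0") {
        saved_params.insert(saved_params.end(),
                            saved_configs.begin(), saved_configs.end());
      }
      for (const auto& key : saved_params) {
        auto kv = cfg.find(key);
        if (kv != cfg.end()) {
          // In LearnerImpl these become extra attributes on the saved model.
          std::cout << key << " -> " << kv->second << '\n';
        }
      }
      return 0;
    }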
tests/ci_build/approx.conf.in (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method=approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"
tests/ci_build/build_mock_cmake.sh (new executable file, 10 lines)
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..
tests/ci_build/runxgb.sh (new executable file, 13 lines)
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run make in rabit/test to generate librabit_mock,
+# then update config.mk and build xgboost using the mock.
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+# Build xgboost with the librabit mock.
+# Define the max worker retries with dmlc-core's local num attempt.
+# Instrument worker failures with mock=xxxx and
+# check that each host recovered from the expected iteration.
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
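
Reading the failure plan: each `mock=r,v,s,d` tuple is consumed by rabit's mock engine, which (as I read `engine_mock.cc`) makes the worker of rank `r` abort on its `s`-th collective call in version (round) `v` once it has already died `d` times. So `mock=0,10,1,0` kills rank 0 in round 10 on its first life, and `mock=1,11,1,1` kills rank 1 again after one prior death. `--local-num-attempt=10` gives the local tracker enough restarts to absorb every injected failure, while `rabit_bootstrap_cache=1` lets each restarted worker replay bootstrap-time collectives from its peers' caches.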