[rabit_bootstrap_cache] failed xgb worker recovers from other workers (#4808)

* Better recovery support: restart only the failed workers.

parent c89bcc4de5
commit 512f037e55
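For orientation: rabit recovers a failed worker by restarting only that process; the restarted worker calls LoadCheckPoint, receives the model state of the last CheckPoint from a surviving peer, and resumes from that round. A minimal sketch of that loop against rabit's public C++ API (the Model type, buffer size and round count are placeholders, not code from this commit):

    #include <vector>
    #include <rabit/rabit.h>

    // Anything implementing rabit::Serializable can be checkpointed and
    // restored from surviving peers after a worker restart.
    struct Model : public rabit::Serializable {
      std::vector<float> weights = std::vector<float>(16, 0.0f);
      void Load(rabit::Stream *fi) override { fi->Read(&weights); }
      void Save(rabit::Stream *fo) const override { fo->Write(weights); }
    };

    int main(int argc, char *argv[]) {
      rabit::Init(argc, argv);
      Model model;
      // Fresh workers get version 0; a restarted worker gets the version of
      // the last checkpoint, with `model` refilled from a surviving peer.
      int version = rabit::LoadCheckPoint(&model);
      const int kRounds = 12;  // placeholder round count
      for (int iter = version; iter < kRounds; ++iter) {
        // ... local boosting work would go here ...
        rabit::Allreduce<rabit::op::Sum>(model.weights.data(), model.weights.size());
        rabit::CheckPoint(&model);  // establishes the next recovery point
      }
      rabit::Finalize();
      return 0;
    }

What this commit adds on top is treatment of the collectives that run before LoadCheckPoint: see the cache keys threaded through hist_util.cc, random.h, data.cc and learner.cc below.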
.travis.yml
@@ -1,7 +1,7 @@
 # disable sudo for container build.
 sudo: required
 
-# Enabling test on Linux and OS X
+# Enabling test OS X
 os:
 - osx
 
CMakeLists.txt
@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
 option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
 option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
 ## CUDA
 option(USE_CUDA "Build with GPU acceleration" OFF)
 option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)
 
 # rabit
 # full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
+else ()
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
 else()
   set(RABIT_SOURCES
     rabit/src/allreduce_base.cc
     rabit/src/allreduce_robust.cc
     rabit/src/engine.cc
     rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
 add_library(rabit STATIC ${RABIT_SOURCES})
 target_include_directories(rabit PRIVATE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>
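With RABIT_MOCK=ON, the static rabit library is built from engine_mock.cc instead of engine.cc: the mock engine keeps the robust allreduce sources but lets the test harness order a specific worker to die at a specific point (the mock=... arguments passed by tests/ci_build/runxgb.sh below), which is what makes the restart-and-recover path reproducible in CI.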
Jenkinsfile (vendored, 32 lines changed)
@@ -56,6 +56,7 @@ pipeline {
     script {
       parallel ([
         'build-cpu': { BuildCPU() },
+        'build-cpu-rabit-mock': { BuildCPUMock() },
         'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
         'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
         'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@ pipeline {
         'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
         'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
         'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+        'test-cpp-rabit': { TestCppRabit() },
         'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
         'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
         'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
   }
 }
 
+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}
+
+
 def BuildCUDA(args) {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
   }
 }
+
+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
 
 def TestCppGPU(args) {
   nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
   node(nodeReq) {
rabit (submodule, 2 lines changed)
@@ -1 +1 @@
-Subproject commit dba32d54d1668033356a2ad505c239411d660821
+Subproject commit 9a7ac85d7eb65b1a0b904e1fa8d5a01b910adda4
src/common/hist_util.cc
@@ -303,6 +303,8 @@ void DenseCuts::Init
   }
   CHECK_EQ(summary_array.size(), in_sketchs->size());
   size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+  // TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after loadcheckpoint;
+  // we need to move this allreduce before the loadcheckpoint call in the future
   sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
   p_cuts_->min_vals_.resize(sketchs.size());
 
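The TODO above is the crux of this change: collectives issued before the first LoadCheckPoint ("bootstrap" calls such as this sketch allreduce, the column-sampler seed broadcast and the num_col_/num_feature max-reductions in the hunks that follow) are not covered by rabit's normal replay of in-round operations. Each one therefore now carries a string cache key ("seed", "num_feature", the data file name), and with rabit_bootstrap_cache=1 a restarted worker takes the cached result instead of re-entering the collective. A self-contained sketch of the idea (the CachedAllreduceSum helper is illustrative only, not rabit's actual implementation):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // Illustrative stand-in for rabit's bootstrap cache: results are
    // remembered under a call-site key so a restarted worker can replay them.
    static std::map<std::string, std::vector<double>> bootstrap_cache;

    // Healthy worker: run the reduction and remember the result under `key`.
    // Restarted worker: return the cached result instead of re-synchronizing
    // (in real rabit the cached buffer is fetched from surviving peers).
    std::vector<double> CachedAllreduceSum(const std::string& key,
                                           std::vector<double> local,
                                           bool restarted) {
      auto it = bootstrap_cache.find(key);
      if (restarted && it != bootstrap_cache.end()) {
        return it->second;  // replay: no collective call after LoadCheckPoint
      }
      // ... a real implementation would call rabit::Allreduce on `local` here ...
      bootstrap_cache[key] = local;
      return local;
    }

    int main() {
      std::vector<double> v{1, 2, 3};
      auto first = CachedAllreduceSum("num_feature", v, /*restarted=*/false);
      auto again = CachedAllreduceSum("num_feature", v, /*restarted=*/true);
      std::printf("replayed %zu cached values\n", again.size());
      return 0;
    }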
src/common/random.h
@@ -127,7 +127,7 @@ class ColumnSampler {
   */
  ColumnSampler() {
    uint32_t seed = common::GlobalRandom()();
-   rabit::Broadcast(&seed, sizeof(seed), 0);
+   rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
    rng_.seed(seed);
  }
 
src/data/data.cc
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
   /* sync up number of features after matrix loaded.
    * partitioned data will fail the train/val validation check
    * since partitioned data not knowing the real number of features. */
-  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+  rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                   nullptr, fname.c_str());
   // backward compatiblity code.
   if (!load_row_split) {
     MetaInfo& info = dmat->Info();
src/learner.cc
@@ -272,6 +272,9 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
       }
+      if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+        cfg_[saved_param] = kv.second;
+      }
     }
   }
   attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -304,6 +307,10 @@ class LearnerImpl : public Learner {
       p_metric->Configure({cfg_.begin(), cfg_.end()});
     }
 
+    // copy dsplit from config since it will not run again during restore
+    if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+      tparam_.dsplit = DataSplitMode::kRow;
+    }
     this->configured_ = true;
   }
 
@@ -334,8 +341,15 @@ class LearnerImpl : public Learner {
       }
     }
     {
-      // Write `predictor`, `gpu_id` parameters as extra attributes
-      for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+      std::vector<std::string> saved_params{"predictor", "gpu_id"};
+      // check if rabit_bootstrap_cache was set to non-zero before adding to checkpoint
+      if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+          (cfg_.find("rabit_bootstrap_cache"))->second != "0") {
+        std::copy(saved_configs_.begin(), saved_configs_.end(),
+                  std::back_inserter(saved_params));
+      }
+      // Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
+      for (const auto& key : saved_params) {
         auto it = cfg_.find(key);
         if (it != cfg_.end()) {
           mparam.contain_extra_attrs = 1;
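Taken together with the load-side hunk at @@ -272,6 +272,9 @@ above, the save path here writes every key in saved_configs_ into the model's extra attributes whenever rabit_bootstrap_cache is non-zero, and the load path copies those attributes back into cfg_, so a restarted worker rebuilds its training configuration from the checkpoint it fetches. A condensed stand-alone illustration of that round trip (SaveConfigs, RestoreConfigs and the Attributes alias are illustrative names, not xgboost's API):

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>

    // Stand-in for the extra-attribute section of a serialized model.
    using Attributes = std::map<std::string, std::string>;

    const std::set<std::string> saved_configs = {
        "max_depth", "tree_method", "dsplit", "seed",
        "silent", "num_round", "gamma", "min_child_weight"};

    // Save path: copy whitelisted config entries into checkpoint attributes.
    Attributes SaveConfigs(const std::map<std::string, std::string>& cfg) {
      Attributes attr;
      for (const auto& key : saved_configs) {
        auto it = cfg.find(key);
        if (it != cfg.end()) attr[key] = it->second;
      }
      return attr;
    }

    // Load path: a restarted worker copies them back into its config.
    void RestoreConfigs(const Attributes& attr,
                        std::map<std::string, std::string>* cfg) {
      for (const auto& kv : attr) {
        if (saved_configs.count(kv.first)) (*cfg)[kv.first] = kv.second;
      }
    }

    int main() {
      std::map<std::string, std::string> cfg{{"max_depth", "5"}, {"eta", "0.5"}};
      Attributes attr = SaveConfigs(cfg);  // only max_depth is checkpointed
      std::map<std::string, std::string> restored;
      RestoreConfigs(attr, &restored);
      std::printf("restored max_depth=%s\n", restored["max_depth"].c_str());
      return 0;
    }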
@@ -603,7 +617,7 @@ class LearnerImpl : public Learner {
       num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
     }
     // run allreduce on num_feature to find the maximum value
-    rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+    rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
     if (num_feature > mparam_.num_feature) {
       mparam_.num_feature = num_feature;
     }
@@ -650,6 +664,10 @@ class LearnerImpl : public Learner {
   std::vector<std::shared_ptr<DMatrix> > cache_;
 
   common::Monitor monitor_;
+
+  /*! \brief saved config keys used to restore failed worker */
+  std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+    "seed", "silent", "num_round", "gamma", "min_child_weight"};
 };
 
 std::string const LearnerImpl::kEvalMetric {"eval_metric"}; // NOLINT
tests/ci_build/approx.conf.in (new file, 12 lines)
@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method=approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"
tests/ci_build/build_mock_cmake.sh (new executable file, 10 lines)
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..
tests/ci_build/runxgb.sh (new executable file, 13 lines)
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# run make in rabit/test to generate librabit_mock,
+# then update config.mk and build xgboost using the mock
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+# build xgboost with librabit mock
+# define max worker retry with dmlc-core local num attempt
+# instrument worker failure with mock=xxxx
+# check if host recovered from expected iteration
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
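A note on the arguments above: --local-num-attempt=10 lets the local dmlc tracker restart each failed worker up to ten times, and each mock=r,v,s,d tuple is a kill instruction for the mock engine, roughly "the worker with rank r dies at collective call s of checkpoint version v, on its d-th death" (the exact field semantics live in the rabit submodule's engine_mock.cc, so verify there). The schedule thus kills ranks 0, 1, 4, 8 and 9 at various points around rounds 10 and 11, while rabit_bootstrap_cache=1 lets the restarted workers replay the keyed pre-checkpoint collectives ("seed", "num_feature", the data file name) introduced elsewhere in this commit.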