[rabit_bootstrap_cache ] failed xgb worker recover from other workers (#4808)
* Better recovery support. Restarting only the failed workers.
This commit is contained in:
12
tests/ci_build/approx.conf.in
Normal file
12
tests/ci_build/approx.conf.in
Normal file
@@ -0,0 +1,12 @@
|
||||
# Originally an example in demo/regression/
|
||||
tree_method=approx
|
||||
eta = 0.5
|
||||
gamma = 1.0
|
||||
seed = 0
|
||||
min_child_weight = 0
|
||||
max_depth = 5
|
||||
|
||||
num_round = 12
|
||||
save_period = 100
|
||||
data = "demo/data/agaricus.txt.train"
|
||||
eval[test] = "demo/data/agaricus.txt.test"
|
||||
10
tests/ci_build/build_mock_cmake.sh
Executable file
10
tests/ci_build/build_mock_cmake.sh
Executable file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
rm -rf build
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
|
||||
make clean
|
||||
make -j$(nproc)
|
||||
cd ..
|
||||
13
tests/ci_build/runxgb.sh
Executable file
13
tests/ci_build/runxgb.sh
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
#run make in rabit/test to generate librabit_mock
|
||||
#update config.mk and build xgboost using mock
|
||||
export DMLC_SUBMIT_CLUSTER=local
|
||||
|
||||
submit="python3 dmlc-core/tracker/dmlc-submit"
|
||||
# build xgboost with librabit mock
|
||||
# define max worker retry with dmlc-core local num attempt
|
||||
# instrument worker failure with mock=xxxx
|
||||
# check if host recovered from expected iteration
|
||||
echo "====== 1. Fault recovery distributed test ======"
|
||||
# Launch the local 10-worker job through dmlc-submit. `exec` replaces this
# shell, so the submit command's exit status is the script's exit status.
# $submit is intentionally unquoted: it holds "python3 <script>" and has to
# word-split into interpreter + script path.
# $1 and $2 are forwarded as single words when set (so paths containing spaces
# or glob characters survive intact) and are dropped entirely when empty.
# The mock=... arguments instrument the worker failures for this test.
# shellcheck disable=SC2086
exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 ${1:+"$1"} ${2:+"$2"} mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
|
||||
Reference in New Issue
Block a user