[rabit_bootstrap_cache ] failed xgb worker recover from other workers (#4808)

* Better recovery support.  Restarting only the failed workers.
This commit is contained in:
Chen Qin
2019-09-16 20:31:52 -07:00
committed by Jiaming Yuan
parent c89bcc4de5
commit 512f037e55
11 changed files with 111 additions and 14 deletions

View File

@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
/* Sync up the number of features after the matrix is loaded.
* Partitioned data would otherwise fail the train/val validation check,
* since a single partition does not know the real number of features. */
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
nullptr, fname.c_str());
// backward compatibility code.
if (!load_row_split) {
MetaInfo& info = dmat->Info();