[rabit_bootstrap_cache ] failed xgb worker recover from other workers (#4808)
* Better recovery support. Restarting only the failed workers.
This commit is contained in:
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
|
||||
/* sync up number of features after matrix loaded.
|
||||
* partitioned data will fail the train/val validation check
|
||||
* since partitioned data does not know the real number of features. */
|
||||
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
|
||||
rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
|
||||
nullptr, fname.c_str());
|
||||
// backward compatibility code.
|
||||
if (!load_row_split) {
|
||||
MetaInfo& info = dmat->Info();
|
||||
|
||||
Reference in New Issue
Block a user