From 9600ca83f3d69dd2708aee9a78d183868057779f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 11 Aug 2021 16:33:53 +0800 Subject: [PATCH] Remove synchronization in monitor. (#7164) * Remove synchronization in monitor. Calling rabit functions during destruction is flaky. * Add xgboost prefix to nvtx marker. --- src/common/timer.cc | 89 +++++---------------------------------------- src/common/timer.h | 2 - 2 files changed, 10 insertions(+), 81 deletions(-) diff --git a/src/common/timer.cc b/src/common/timer.cc index 79c823dd7..d711446d3 100644 --- a/src/common/timer.cc +++ b/src/common/timer.cc @@ -8,7 +8,6 @@ #include #include #include "timer.h" -#include "xgboost/json.h" #if defined(XGBOOST_USE_NVTX) #include @@ -22,7 +21,7 @@ void Monitor::Start(std::string const &name) { auto &stats = statistics_map_[name]; stats.timer.Start(); #if defined(XGBOOST_USE_NVTX) - std::string nvtx_name = label_ + "::" + name; + std::string nvtx_name = "xgboost::" + label_ + "::" + name; stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str()); #endif // defined(XGBOOST_USE_NVTX) } @@ -39,60 +38,6 @@ void Monitor::Stop(const std::string &name) { } } -std::vector Monitor::CollectFromOtherRanks() const { - // Since other nodes might have started timers that this one haven't, so - // we can't simply call all reduce. - size_t const world_size = rabit::GetWorldSize(); - size_t const rank = rabit::GetRank(); - - // It's much easier to work with rabit if we have a string serialization. So we go with - // json. - Json j_statistic { Object() }; - j_statistic["rank"] = Integer(rank); - j_statistic["statistic"] = Object(); - - auto& statistic = j_statistic["statistic"]; - for (auto const& kv : statistics_map_) { - statistic[kv.first] = Object(); - auto& j_pair = statistic[kv.first]; - j_pair["count"] = Integer(kv.second.count); - j_pair["elapsed"] = Integer(static_cast( - std::chrono::duration_cast( - kv.second.timer.elapsed).count())); - } - - std::string str; - Json::Dump(j_statistic, &str); - - size_t str_size = str.size(); - rabit::Allreduce(&str_size, 1); - std::string buffer; - buffer.resize(str_size); - - // vector storing stat from all workers - std::vector world(world_size); - - // Actually only rank 0 is printing. - for (size_t i = 0; i < world_size; ++i) { - std::copy(str.cbegin(), str.cend(), buffer.begin()); - rabit::Broadcast(&buffer, i); - auto j_other = Json::Load(StringView{buffer.c_str(), buffer.size()}); - auto& other = world[i]; - - auto const& j_statistic = get(j_other["statistic"]); - - for (auto const& kv : j_statistic) { - std::string const& timer_name = kv.first; - auto const& pair = kv.second; - other[timer_name] = {get(pair["count"]), get(pair["elapsed"])}; - } - - // FIXME(trivialfis): How to ask rabit to block here? - } - - return world; -} - void Monitor::PrintStatistics(StatMap const& statistics) const { for (auto &kv : statistics) { if (kv.second.first == 0) { @@ -109,30 +54,16 @@ void Monitor::PrintStatistics(StatMap const& statistics) const { void Monitor::Print() const { if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; } - - bool is_distributed = rabit::IsDistributed(); - - if (is_distributed) { - auto world = this->CollectFromOtherRanks(); - // rank zero is in charge of printing - if (rabit::GetRank() == 0) { - LOG(CONSOLE) << "======== Monitor: " << label_ << " ========"; - for (size_t i = 0; i < world.size(); ++i) { - LOG(CONSOLE) << "From rank: " << i << ": " << std::endl; - auto const& statistic = world[i]; - this->PrintStatistics(statistic); - } - } - } else { - StatMap stat_map; - for (auto const& kv : statistics_map_) { - stat_map[kv.first] = std::make_pair( - kv.second.count, std::chrono::duration_cast( - kv.second.timer.elapsed).count()); - } - LOG(CONSOLE) << "======== Monitor: " << label_ << " ========"; - this->PrintStatistics(stat_map); + auto rank = rabit::GetRank(); + StatMap stat_map; + for (auto const &kv : statistics_map_) { + stat_map[kv.first] = std::make_pair( + kv.second.count, std::chrono::duration_cast( + kv.second.timer.elapsed) + .count()); } + LOG(CONSOLE) << "======== Monitor (" << rank << "): " << label_ << " ========"; + this->PrintStatistics(stat_map); } } // namespace common diff --git a/src/common/timer.h b/src/common/timer.h index b2dc2a428..3daaeda8c 100644 --- a/src/common/timer.h +++ b/src/common/timer.h @@ -59,8 +59,6 @@ struct Monitor { std::map statistics_map_; Timer self_timer_; - /*! \brief Collect time statistics across all workers. */ - std::vector CollectFromOtherRanks() const; void PrintStatistics(StatMap const& statistics) const; public: