Remove synchronization in monitor. (#7164)
* Remove synchronization in monitor. Calling rabit functions during destruction is flaky. * Add xgboost prefix to nvtx marker.
This commit is contained in:
parent
149f209af6
commit
9600ca83f3
@ -8,7 +8,6 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include "timer.h"
|
#include "timer.h"
|
||||||
#include "xgboost/json.h"
|
|
||||||
|
|
||||||
#if defined(XGBOOST_USE_NVTX)
|
#if defined(XGBOOST_USE_NVTX)
|
||||||
#include <nvToolsExt.h>
|
#include <nvToolsExt.h>
|
||||||
@ -22,7 +21,7 @@ void Monitor::Start(std::string const &name) {
|
|||||||
auto &stats = statistics_map_[name];
|
auto &stats = statistics_map_[name];
|
||||||
stats.timer.Start();
|
stats.timer.Start();
|
||||||
#if defined(XGBOOST_USE_NVTX)
|
#if defined(XGBOOST_USE_NVTX)
|
||||||
std::string nvtx_name = label_ + "::" + name;
|
std::string nvtx_name = "xgboost::" + label_ + "::" + name;
|
||||||
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
|
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
|
||||||
#endif // defined(XGBOOST_USE_NVTX)
|
#endif // defined(XGBOOST_USE_NVTX)
|
||||||
}
|
}
|
||||||
@ -39,60 +38,6 @@ void Monitor::Stop(const std::string &name) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Monitor::StatMap> Monitor::CollectFromOtherRanks() const {
|
|
||||||
// Since other nodes might have started timers that this one haven't, so
|
|
||||||
// we can't simply call all reduce.
|
|
||||||
size_t const world_size = rabit::GetWorldSize();
|
|
||||||
size_t const rank = rabit::GetRank();
|
|
||||||
|
|
||||||
// It's much easier to work with rabit if we have a string serialization. So we go with
|
|
||||||
// json.
|
|
||||||
Json j_statistic { Object() };
|
|
||||||
j_statistic["rank"] = Integer(rank);
|
|
||||||
j_statistic["statistic"] = Object();
|
|
||||||
|
|
||||||
auto& statistic = j_statistic["statistic"];
|
|
||||||
for (auto const& kv : statistics_map_) {
|
|
||||||
statistic[kv.first] = Object();
|
|
||||||
auto& j_pair = statistic[kv.first];
|
|
||||||
j_pair["count"] = Integer(kv.second.count);
|
|
||||||
j_pair["elapsed"] = Integer(static_cast<int64_t>(
|
|
||||||
std::chrono::duration_cast<std::chrono::microseconds>(
|
|
||||||
kv.second.timer.elapsed).count()));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string str;
|
|
||||||
Json::Dump(j_statistic, &str);
|
|
||||||
|
|
||||||
size_t str_size = str.size();
|
|
||||||
rabit::Allreduce<rabit::op::Max>(&str_size, 1);
|
|
||||||
std::string buffer;
|
|
||||||
buffer.resize(str_size);
|
|
||||||
|
|
||||||
// vector storing stat from all workers
|
|
||||||
std::vector<StatMap> world(world_size);
|
|
||||||
|
|
||||||
// Actually only rank 0 is printing.
|
|
||||||
for (size_t i = 0; i < world_size; ++i) {
|
|
||||||
std::copy(str.cbegin(), str.cend(), buffer.begin());
|
|
||||||
rabit::Broadcast(&buffer, i);
|
|
||||||
auto j_other = Json::Load(StringView{buffer.c_str(), buffer.size()});
|
|
||||||
auto& other = world[i];
|
|
||||||
|
|
||||||
auto const& j_statistic = get<Object>(j_other["statistic"]);
|
|
||||||
|
|
||||||
for (auto const& kv : j_statistic) {
|
|
||||||
std::string const& timer_name = kv.first;
|
|
||||||
auto const& pair = kv.second;
|
|
||||||
other[timer_name] = {get<Integer>(pair["count"]), get<Integer>(pair["elapsed"])};
|
|
||||||
}
|
|
||||||
|
|
||||||
// FIXME(trivialfis): How to ask rabit to block here?
|
|
||||||
}
|
|
||||||
|
|
||||||
return world;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Monitor::PrintStatistics(StatMap const& statistics) const {
|
void Monitor::PrintStatistics(StatMap const& statistics) const {
|
||||||
for (auto &kv : statistics) {
|
for (auto &kv : statistics) {
|
||||||
if (kv.second.first == 0) {
|
if (kv.second.first == 0) {
|
||||||
@ -109,30 +54,16 @@ void Monitor::PrintStatistics(StatMap const& statistics) const {
|
|||||||
|
|
||||||
void Monitor::Print() const {
|
void Monitor::Print() const {
|
||||||
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
|
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
|
||||||
|
auto rank = rabit::GetRank();
|
||||||
bool is_distributed = rabit::IsDistributed();
|
StatMap stat_map;
|
||||||
|
for (auto const &kv : statistics_map_) {
|
||||||
if (is_distributed) {
|
stat_map[kv.first] = std::make_pair(
|
||||||
auto world = this->CollectFromOtherRanks();
|
kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>(
|
||||||
// rank zero is in charge of printing
|
kv.second.timer.elapsed)
|
||||||
if (rabit::GetRank() == 0) {
|
.count());
|
||||||
LOG(CONSOLE) << "======== Monitor: " << label_ << " ========";
|
|
||||||
for (size_t i = 0; i < world.size(); ++i) {
|
|
||||||
LOG(CONSOLE) << "From rank: " << i << ": " << std::endl;
|
|
||||||
auto const& statistic = world[i];
|
|
||||||
this->PrintStatistics(statistic);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
StatMap stat_map;
|
|
||||||
for (auto const& kv : statistics_map_) {
|
|
||||||
stat_map[kv.first] = std::make_pair(
|
|
||||||
kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>(
|
|
||||||
kv.second.timer.elapsed).count());
|
|
||||||
}
|
|
||||||
LOG(CONSOLE) << "======== Monitor: " << label_ << " ========";
|
|
||||||
this->PrintStatistics(stat_map);
|
|
||||||
}
|
}
|
||||||
|
LOG(CONSOLE) << "======== Monitor (" << rank << "): " << label_ << " ========";
|
||||||
|
this->PrintStatistics(stat_map);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace common
|
} // namespace common
|
||||||
|
|||||||
@ -59,8 +59,6 @@ struct Monitor {
|
|||||||
std::map<std::string, Statistics> statistics_map_;
|
std::map<std::string, Statistics> statistics_map_;
|
||||||
Timer self_timer_;
|
Timer self_timer_;
|
||||||
|
|
||||||
/*! \brief Collect time statistics across all workers. */
|
|
||||||
std::vector<StatMap> CollectFromOtherRanks() const;
|
|
||||||
void PrintStatistics(StatMap const& statistics) const;
|
void PrintStatistics(StatMap const& statistics) const;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user