Use NCCL group calls to prevent deadlock. (#4113)

* Launch the all-reduce operations sequentially inside one NCCL group.
* Fix gpu_exact test memory leak.
This commit is contained in:
Jiaming Yuan
2019-02-08 06:12:39 +08:00
committed by GitHub
parent 05243642bb
commit f8ca2960fc
4 changed files with 18 additions and 15 deletions

View File

@@ -1050,19 +1050,20 @@ class GPUHistMakerSpecialised{
// AllReduceHist: sum the per-device histograms for tree node `nidx` across
// all shards via reducer_, so each device ends up with the global histogram.
// NOTE(review): this span is a unified-diff hunk body from a commit page —
// the removed (old) and added (new) implementations appear back-to-back.
void AllReduceHist(int nidx) {
// Single shard: there is nothing to reduce across devices.
if (shards_.size() == 1) return;
monitor_.Start("AllReduce");
// --- OLD implementation (removed by this commit): one AllReduceSum issued
// per shard through dh::ExecuteIndexShards. Per the commit title (L1),
// launching the per-device reductions this way could deadlock. ---
dh::ExecuteIndexShards(
&shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
reducer_.AllReduceSum(
dist_.Devices().Index(shard->device_id_),
// In-place reduction: the same device buffer is both send and recv.
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
// Element count: n_bins_ histograms, each GradientSumT decomposed
// into its scalar ValueT components.
n_bins_ * (sizeof(GradientSumT) /
sizeof(typename GradientSumT::ValueT)));
});
// --- NEW implementation (added by this commit): issue the per-shard
// AllReduceSum calls sequentially, bracketed by GroupStart()/GroupEnd()
// so the reducer treats them as one grouped launch (NCCL group semantics),
// which — per the commit title — prevents the deadlock. ---
reducer_.GroupStart();
for (auto& shard : shards_) {
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
reducer_.AllReduceSum(
dist_.Devices().Index(shard->device_id_),
// Same in-place send/recv buffer and element count as above.
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
}
reducer_.GroupEnd();
// Block until the grouped reductions have completed on all devices.
reducer_.Synchronize();
monitor_.Stop("AllReduce");
}