Use NCCL group calls to prevent deadlock. (#4113)

* Launch all-reduce calls sequentially.
* Fix gpu_exact test memory leak.
This commit is contained in:
Jiaming Yuan 2019-02-08 06:12:39 +08:00 committed by GitHub
parent 05243642bb
commit f8ca2960fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 18 additions and 15 deletions

View File

@ -852,7 +852,7 @@ class AllReducer {
#ifdef XGBOOST_USE_NCCL
std::vector<ncclComm_t> comms;
std::vector<cudaStream_t> streams;
std::vector<int> device_ordinals;
std::vector<int> device_ordinals; // device id from CUDA
#endif
public:

View File

@ -459,9 +459,8 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=
/*!
 * \brief Destructor: deletes the implementation object pointed to by impl_.
 *
 * The pointer is deleted exactly once and then reset to nullptr. Deleting it
 * through both impl_ and a saved alias would free the same object twice.
 */
template <typename T>
HostDeviceVector<T>::~HostDeviceVector() {
  delete impl_;
  impl_ = nullptr;
}
template <typename T>

View File

@ -1050,19 +1050,20 @@ class GPUHistMakerSpecialised{
void AllReduceHist(int nidx) {
if (shards_.size() == 1) return;
monitor_.Start("AllReduce");
dh::ExecuteIndexShards(
&shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
reducer_.AllReduceSum(
dist_.Devices().Index(shard->device_id_),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
n_bins_ * (sizeof(GradientSumT) /
sizeof(typename GradientSumT::ValueT)));
});
reducer_.GroupStart();
for (auto& shard : shards_) {
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
reducer_.AllReduceSum(
dist_.Devices().Index(shard->device_id_),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
}
reducer_.GroupEnd();
reducer_.Synchronize();
monitor_.Stop("AllReduce");
}

View File

@ -42,6 +42,9 @@ TEST(GPUExact, Update) {
ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
delete dmat;
delete p_gpuexact_maker;
}
} // namespace tree