Use NCCL group calls to prevent deadlock. (#4113)

* Launch all-reduce operations sequentially.
* Fix gpu_exact test memory leak.
This commit is contained in:
Jiaming Yuan 2019-02-08 06:12:39 +08:00 committed by GitHub
parent 05243642bb
commit f8ca2960fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 18 additions and 15 deletions

View File

@ -852,7 +852,7 @@ class AllReducer {
#ifdef XGBOOST_USE_NCCL #ifdef XGBOOST_USE_NCCL
std::vector<ncclComm_t> comms; std::vector<ncclComm_t> comms;
std::vector<cudaStream_t> streams; std::vector<cudaStream_t> streams;
std::vector<int> device_ordinals; std::vector<int> device_ordinals; // device id from CUDA
#endif #endif
public: public:

View File

@ -459,9 +459,8 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=
template <typename T> template <typename T>
HostDeviceVector<T>::~HostDeviceVector() { HostDeviceVector<T>::~HostDeviceVector() {
HostDeviceVectorImpl<T>* tmp = impl_; delete impl_;
impl_ = nullptr; impl_ = nullptr;
delete tmp;
} }
template <typename T> template <typename T>

View File

@ -1050,19 +1050,20 @@ class GPUHistMakerSpecialised{
void AllReduceHist(int nidx) { void AllReduceHist(int nidx) {
if (shards_.size() == 1) return; if (shards_.size() == 1) return;
monitor_.Start("AllReduce"); monitor_.Start("AllReduce");
dh::ExecuteIndexShards(
&shards_, reducer_.GroupStart();
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) { for (auto& shard : shards_) {
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data(); auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
reducer_.AllReduceSum( reducer_.AllReduceSum(
dist_.Devices().Index(shard->device_id_), dist_.Devices().Index(shard->device_id_),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist), reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist), reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
n_bins_ * (sizeof(GradientSumT) / n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
sizeof(typename GradientSumT::ValueT))); }
}); reducer_.GroupEnd();
reducer_.Synchronize();
monitor_.Stop("AllReduce"); monitor_.Stop("AllReduce");
} }

View File

@ -42,6 +42,9 @@ TEST(GPUExact, Update) {
ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps); ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps); ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
delete dmat;
delete p_gpuexact_maker;
} }
} // namespace tree } // namespace tree