Use NCCL group calls to prevent deadlock. (#4113)
* Launch all-reduce sequentially.
* Fix gpu_exact test memory leak.
This commit is contained in:
parent 05243642bb
commit f8ca2960fc
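Background on the deadlock in the title, as a minimal sketch rather than the actual xgboost AllReducer: when one host thread drives one NCCL communicator per GPU, the per-device collectives need to be wrapped in ncclGroupStart()/ncclGroupEnd(). The function and variable names below are illustrative assumptions, not names from this repository.

// Minimal sketch (assumed names): one host thread issues an all-reduce on every
// device.  Without group semantics, the first ncclAllReduce may be unable to
// complete until the collectives for the other communicators have also been
// launched -- by this same thread -- so the loop can deadlock.  Grouping defers
// all launches until ncclGroupEnd().
#include <nccl.h>
#include <cuda_runtime.h>
#include <vector>

void AllReduceSumAcrossDevices(const std::vector<ncclComm_t>& comms,
                               const std::vector<cudaStream_t>& streams,
                               const std::vector<float*>& bufs, size_t count) {
  ncclGroupStart();                        // defer all launches
  for (size_t i = 0; i < comms.size(); ++i) {
    ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);   // in-place sum on each device
  }
  ncclGroupEnd();                          // launch everything together
  for (auto s : streams) {
    cudaStreamSynchronize(s);              // wait for the results
  }
}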
@@ -852,7 +852,7 @@ class AllReducer {
 #ifdef XGBOOST_USE_NCCL
   std::vector<ncclComm_t> comms;
   std::vector<cudaStream_t> streams;
-  std::vector<int> device_ordinals;
+  std::vector<int> device_ordinals;  // device id from CUDA
 #endif
 
  public:
@@ -459,9 +459,8 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=
 
 template <typename T>
 HostDeviceVector<T>::~HostDeviceVector() {
-  HostDeviceVectorImpl<T>* tmp = impl_;
+  delete impl_;
   impl_ = nullptr;
-  delete tmp;
 }
 
 template <typename T>
@@ -1050,19 +1050,20 @@ class GPUHistMakerSpecialised{
 
   void AllReduceHist(int nidx) {
     if (shards_.size() == 1) return;
 
     monitor_.Start("AllReduce");
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
-          reducer_.AllReduceSum(
-              dist_.Devices().Index(shard->device_id_),
-              reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-              reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-              n_bins_ * (sizeof(GradientSumT) /
-                         sizeof(typename GradientSumT::ValueT)));
-        });
+    reducer_.GroupStart();
+    for (auto& shard : shards_) {
+      auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
+      reducer_.AllReduceSum(
+          dist_.Devices().Index(shard->device_id_),
+          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+          n_bins_ * (sizeof(GradientSumT) /
+                     sizeof(typename GradientSumT::ValueT)));
+    }
+    reducer_.GroupEnd();
+    reducer_.Synchronize();
 
     monitor_.Stop("AllReduce");
   }
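For contrast with the hunk above, a sketch of the hazard the old per-shard dispatch could run into, reusing the illustrative names from the sketch near the top (again, not the repository's own code):

// Counter-example sketch: the same loop WITHOUT ncclGroupStart()/ncclGroupEnd().
// NCCL requires group semantics when one thread manages several communicators;
// without it, the call on comms[0] may not be able to proceed until the
// collectives for the remaining communicators are launched, yet this thread is
// the one that has to launch them -- the deadlock the commit title refers to.
for (size_t i = 0; i < comms.size(); ++i) {
  ncclAllReduce(bufs[i], bufs[i], count, ncclFloat, ncclSum,
                comms[i], streams[i]);
}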
@@ -42,6 +42,9 @@ TEST(GPUExact, Update) {
   ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
 
   ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
+
+  delete dmat;
+  delete p_gpuexact_maker;
 }
 
 }  // namespace tree