Use NCCL group calls to prevent deadlock. (#4113)

* Launch all-reduce sequentially. * Fix gpu_exact test memory leak.
2019-02-08 06:12:39 +08:00
parent 05243642bb
commit f8ca2960fc
4 changed files with 18 additions and 15 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -852,7 +852,7 @@ class AllReducer {
 #ifdef XGBOOST_USE_NCCL
  std::vector<ncclComm_t> comms;
  std::vector<cudaStream_t> streams;
-  std::vector<int> device_ordinals;
+  std::vector<int> device_ordinals;  // device id from CUDA
 #endif

 public:
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -459,9 +459,8 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=

 template <typename T>
 HostDeviceVector<T>::~HostDeviceVector() {
-  HostDeviceVectorImpl<T>* tmp = impl_;
+  delete impl_;
  impl_ = nullptr;
-  delete tmp;
 }

 template <typename T>