Set the appropriate device before freeing device memory... (#4566)

* - set the appropriate device before freeing device memory...
   - PR #4532 added a global memory tracker/logger to keep track of the number of (de)allocations
     and peak memory usage on a per-device basis.
   - This PR adds the appropriate check to make sure that the (de)allocation counts and memory usages
     make sense for the device, since verbosity is typically increased on debug/non-retail builds.
* - pre-create cub allocators and reuse them
   - Create them once and do not resize them dynamically. We need to ensure that these allocators
     are created and destroyed exactly once so that the appropriate device IDs are set
This commit is contained in:
sriramch 2019-06-17 19:58:05 -07:00 committed by Rory Mitchell
parent a22368d210
commit 90f683b25b
8 changed files with 72 additions and 18 deletions

View File

@ -237,11 +237,19 @@ class MemoryLogger {
peak_allocated_bytes =
std::max(peak_allocated_bytes, currently_allocated_bytes);
num_allocations++;
CHECK_GT(num_allocations, num_deallocations);
}
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
auto itr = device_allocations.find(ptr);
if (itr == device_allocations.end()) {
LOG(FATAL) << "Attempting to deallocate " << n << " bytes on device "
<< current_device << " that was never allocated ";
}
void RegisterDeallocation(void *ptr) {
num_deallocations++;
currently_allocated_bytes -= device_allocations[ptr];
device_allocations.erase(ptr);
CHECK_LE(num_deallocations, num_allocations);
CHECK_EQ(itr->second, n);
currently_allocated_bytes -= itr->second;
device_allocations.erase(itr);
}
};
std::map<int, DeviceStats>
@ -256,14 +264,15 @@ public:
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_[current_device].RegisterAllocation(ptr, n);
CHECK_LE(stats_[current_device].peak_allocated_bytes, dh::TotalMemory(current_device));
}
void RegisterDeallocation(void *ptr) {
void RegisterDeallocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
return;
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_[current_device].RegisterDeallocation(ptr);
stats_[current_device].RegisterDeallocation(ptr, n, current_device);
}
void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
@ -299,7 +308,7 @@ struct XGBDefaultDeviceAllocator : thrust::device_malloc_allocator<T> {
return ptr;
}
void deallocate(pointer ptr, size_t n) {
GlobalMemoryLogger().RegisterDeallocation(ptr.get());
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n);
return super_t::deallocate(ptr, n);
}
};
@ -543,6 +552,8 @@ struct CubMemory {
XGBDeviceAllocator<uint8_t> allocator;
allocator.deallocate(thrust::device_ptr<uint8_t>(static_cast<uint8_t *>(d_temp_storage)),
temp_storage_bytes);
d_temp_storage = nullptr;
temp_storage_bytes = 0;
}
}

View File

@ -150,6 +150,10 @@ struct GPUSketcher {
n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
}
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_));
}
inline size_t GetRowStride() const {
return row_stride_;
}

View File

@ -49,6 +49,10 @@ struct HostDeviceVectorImpl {
: proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
~DeviceShard() {
SetDevice();
}
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);

View File

@ -81,6 +81,10 @@ class DeviceShard {
RescaleIndices(device_id_, ridx_begin_, data_);
}
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_id_));
}
bool IsEmpty() {
return (ridx_end_ - ridx_begin_) == 0;
}

View File

@ -58,6 +58,14 @@ class ElementWiseMetricsReduction {
#if defined(XGBOOST_USE_CUDA)
~ElementWiseMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices_.Index(id);
allocators_.at(index).Free();
}
}
PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
@ -96,6 +104,7 @@ class ElementWiseMetricsReduction {
#endif // XGBOOST_USE_CUDA
PackedReduceResult Reduce(
const LearnerTrainParam &tparam,
GPUSet devices,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
@ -107,9 +116,9 @@ class ElementWiseMetricsReduction {
}
#if defined(XGBOOST_USE_CUDA)
else { // NOLINT
if (allocators_.size() != devices.Size()) {
allocators_.clear();
allocators_.resize(devices.Size());
if (allocators_.empty()) {
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
allocators_.resize(devices_.Size());
}
preds.Shard(devices);
labels.Shard(devices);
@ -135,6 +144,7 @@ class ElementWiseMetricsReduction {
private:
EvalRow policy_;
#if defined(XGBOOST_USE_CUDA)
GPUSet devices_;
std::vector<dh::CubMemory> allocators_;
#endif // defined(XGBOOST_USE_CUDA)
};
@ -339,7 +349,7 @@ struct EvalEWiseBase : public Metric {
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto result =
reducer_.Reduce(devices, info.weights_, info.labels_, preds);
reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() };
if (distributed) {

View File

@ -73,6 +73,14 @@ class MultiClassMetricsReduction {
#if defined(XGBOOST_USE_CUDA)
~MultiClassMetricsReduction() {
for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
dh::safe_cuda(cudaSetDevice(id));
size_t index = devices_.Index(id);
allocators_.at(index).Free();
}
}
PackedReduceResult DeviceReduceMetrics(
GPUSet::GpuIdType device_id,
size_t device_index,
@ -118,6 +126,7 @@ class MultiClassMetricsReduction {
#endif // XGBOOST_USE_CUDA
PackedReduceResult Reduce(
const LearnerTrainParam &tparam,
GPUSet devices,
size_t n_class,
const HostDeviceVector<bst_float>& weights,
@ -130,9 +139,9 @@ class MultiClassMetricsReduction {
}
#if defined(XGBOOST_USE_CUDA)
else { // NOLINT
if (allocators_.size() != devices.Size()) {
allocators_.clear();
allocators_.resize(devices.Size());
if (allocators_.empty()) {
devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
allocators_.resize(devices_.Size());
}
preds.Shard(GPUDistribution::Granular(devices, n_class));
labels.Shard(devices);
@ -158,6 +167,7 @@ class MultiClassMetricsReduction {
private:
#if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_;
GPUSet devices_;
std::vector<dh::CubMemory> allocators_;
#endif // defined(XGBOOST_USE_CUDA)
};
@ -181,7 +191,7 @@ struct EvalMClassBase : public Metric {
const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
auto result = reducer_.Reduce(devices, nclass, info.weights_, info.labels_, preds);
auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds);
double dat[2] { result.Residue(), result.Weights() };
if (distributed) {

View File

@ -19,6 +19,17 @@ void SetDevice(int device) {
dh::safe_cuda(cudaSetDevice(device));
}
struct HostDeviceVectorSetDeviceHandler {
template <typename Functor>
explicit HostDeviceVectorSetDeviceHandler(Functor f) {
SetCudaSetDeviceHandler(f);
}
~HostDeviceVectorSetDeviceHandler() {
SetCudaSetDeviceHandler(nullptr);
}
};
void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
HostDeviceVector<int> *v) {
// create the vector
@ -107,7 +118,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
void TestHostDeviceVector
(size_t n, const GPUDistribution& distribution,
const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
SetCudaSetDeviceHandler(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, distribution, &v);
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
@ -115,7 +126,6 @@ void TestHostDeviceVector
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
SetCudaSetDeviceHandler(nullptr);
}
TEST(HostDeviceVector, TestBlock) {
@ -161,7 +171,7 @@ TEST(HostDeviceVector, TestCopy) {
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
SetCudaSetDeviceHandler(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
{
@ -175,7 +185,6 @@ TEST(HostDeviceVector, TestCopy) {
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
CheckHost(&v, GPUAccess::kRead);
CheckHost(&v, GPUAccess::kWrite);
SetCudaSetDeviceHandler(nullptr);
}
TEST(HostDeviceVector, Shard) {

View File

@ -10,6 +10,7 @@
#include "gtest/gtest.h"
#include "../helpers.h"
#if defined(XGBOOST_USE_NCCL)
namespace {
inline void CheckCAPICall(int ret) {
@ -17,6 +18,7 @@ inline void CheckCAPICall(int ret) {
}
} // namespace anonymous
#endif
extern const std::map<std::string, std::string>&
QueryBoosterConfigurationArguments(BoosterHandle handle);