Overload device memory allocation (#4532)
* Group source files, include headers in source files
* Overload device memory allocation
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
#pragma once
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
#include <thrust/system_error.h>
|
||||
#include <xgboost/logging.h>
|
||||
@@ -49,11 +50,6 @@ inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
T *Raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
inline void CudaCheckPointerDevice(void* ptr) {
|
||||
cudaPointerAttributes attr;
|
||||
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
|
||||
@@ -225,6 +221,97 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
|
||||
LaunchN<ITEMS_PER_THREAD, BLOCK_THREADS>(device_idx, n, nullptr, lambda);
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
/** \brief Keeps track of global device memory allocations. Thread safe.*/
|
||||
class MemoryLogger {
|
||||
// Information for a single device
|
||||
struct DeviceStats {
|
||||
size_t currently_allocated_bytes{ 0 };
|
||||
size_t peak_allocated_bytes{ 0 };
|
||||
size_t num_allocations{ 0 };
|
||||
size_t num_deallocations{ 0 };
|
||||
std::map<void *, size_t> device_allocations;
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
device_allocations[ptr] = n;
|
||||
currently_allocated_bytes += n;
|
||||
peak_allocated_bytes =
|
||||
std::max(peak_allocated_bytes, currently_allocated_bytes);
|
||||
num_allocations++;
|
||||
}
|
||||
void RegisterDeallocation(void *ptr) {
|
||||
num_deallocations++;
|
||||
currently_allocated_bytes -= device_allocations[ptr];
|
||||
device_allocations.erase(ptr);
|
||||
}
|
||||
};
|
||||
std::map<int, DeviceStats>
|
||||
stats_; // Map device ordinal to memory information
|
||||
std::mutex mutex_;
|
||||
|
||||
public:
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterAllocation(ptr, n);
|
||||
}
|
||||
void RegisterDeallocation(void *ptr) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterDeallocation(ptr);
|
||||
}
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
for (const auto &kv : stats_) {
|
||||
LOG(CONSOLE) << "======== Device " << kv.first << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< kv.second.peak_allocated_bytes / 1000000 << "mb";
|
||||
LOG(CONSOLE) << "Number of allocations: " << kv.second.num_allocations;
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Process-wide memory logger singleton, constructed on first use.
inline detail::MemoryLogger &GlobalMemoryLogger() {
  static detail::MemoryLogger global_memory_logger;
  return global_memory_logger;
}
|
||||
|
||||
namespace detail{
|
||||
/**
|
||||
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
|
||||
*/
|
||||
template <class T>
|
||||
struct XGBDefaultDeviceAllocator : thrust::device_malloc_allocator<T> {
|
||||
using super_t = thrust::device_malloc_allocator<T>;
|
||||
using pointer = thrust::device_ptr<T>;
|
||||
pointer allocate(size_t n) {
|
||||
pointer ptr = super_t::allocate(n);
|
||||
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n);
|
||||
return ptr;
|
||||
}
|
||||
void deallocate(pointer ptr, size_t n) {
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr.get());
|
||||
return super_t::deallocate(ptr, n);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Declare xgboost allocator
// Replacement of allocator with custom backend should occur here
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocator<T>;
/** \brief Specialisation of thrust device vector using custom allocator. */
template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;
|
||||
|
||||
/**
|
||||
* \brief A double buffer, useful for algorithms like sort.
|
||||
@@ -335,10 +422,9 @@ class BulkAllocator {
|
||||
}
|
||||
|
||||
char *AllocateDevice(int device_idx, size_t bytes) {
|
||||
char *ptr;
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaMalloc(&ptr, bytes));
|
||||
return ptr;
|
||||
XGBDeviceAllocator<char> allocator;
|
||||
return allocator.allocate(bytes).get();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -383,7 +469,8 @@ class BulkAllocator {
|
||||
for (size_t i = 0; i < d_ptr_.size(); i++) {
|
||||
if (!(d_ptr_[i] == nullptr)) {
|
||||
safe_cuda(cudaSetDevice(device_idx_[i]));
|
||||
safe_cuda(cudaFree(d_ptr_[i]));
|
||||
XGBDeviceAllocator<char> allocator;
|
||||
allocator.deallocate(thrust::device_ptr<char>(d_ptr_[i]), size_[i]);
|
||||
d_ptr_[i] = nullptr;
|
||||
}
|
||||
}
|
||||
@@ -453,14 +540,17 @@ struct CubMemory {
|
||||
|
||||
void Free() {
|
||||
if (this->IsAllocated()) {
|
||||
safe_cuda(cudaFree(d_temp_storage));
|
||||
XGBDeviceAllocator<uint8_t> allocator;
|
||||
allocator.deallocate(thrust::device_ptr<uint8_t>(static_cast<uint8_t *>(d_temp_storage)),
|
||||
temp_storage_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
void LazyAllocate(size_t num_bytes) {
|
||||
if (num_bytes > temp_storage_bytes) {
|
||||
Free();
|
||||
safe_cuda(cudaMalloc(&d_temp_storage, num_bytes));
|
||||
XGBDeviceAllocator<uint8_t> allocator;
|
||||
d_temp_storage = static_cast<void *>(allocator.allocate(num_bytes).get());
|
||||
temp_storage_bytes = num_bytes;
|
||||
}
|
||||
}
|
||||
@@ -1119,7 +1209,7 @@ ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
|
||||
template <typename T,
|
||||
typename IndexT = typename xgboost::common::Span<T>::index_type>
|
||||
xgboost::common::Span<T> ToSpan(
|
||||
thrust::device_vector<T>& vec,
|
||||
device_vector<T>& vec,
|
||||
IndexT offset = 0,
|
||||
IndexT size = -1) {
|
||||
size = size == -1 ? vec.size() : size;
|
||||
|
||||
@@ -130,18 +130,18 @@ struct GPUSketcher {
|
||||
|
||||
tree::TrainParam param_;
|
||||
SketchContainer *sketch_container_;
|
||||
thrust::device_vector<size_t> row_ptrs_;
|
||||
thrust::device_vector<Entry> entries_;
|
||||
thrust::device_vector<bst_float> fvalues_;
|
||||
thrust::device_vector<bst_float> feature_weights_;
|
||||
thrust::device_vector<bst_float> fvalues_cur_;
|
||||
thrust::device_vector<WXQSketch::Entry> cuts_d_;
|
||||
dh::device_vector<size_t> row_ptrs_;
|
||||
dh::device_vector<Entry> entries_;
|
||||
dh::device_vector<bst_float> fvalues_;
|
||||
dh::device_vector<bst_float> feature_weights_;
|
||||
dh::device_vector<bst_float> fvalues_cur_;
|
||||
dh::device_vector<WXQSketch::Entry> cuts_d_;
|
||||
thrust::host_vector<WXQSketch::Entry> cuts_h_;
|
||||
thrust::device_vector<bst_float> weights_;
|
||||
thrust::device_vector<bst_float> weights2_;
|
||||
dh::device_vector<bst_float> weights_;
|
||||
dh::device_vector<bst_float> weights2_;
|
||||
std::vector<size_t> n_cuts_cur_;
|
||||
thrust::device_vector<size_t> num_elements_;
|
||||
thrust::device_vector<char> tmp_storage_;
|
||||
dh::device_vector<size_t> num_elements_;
|
||||
dh::device_vector<char> tmp_storage_;
|
||||
|
||||
public:
|
||||
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
|
||||
|
||||
@@ -161,7 +161,7 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
private:
|
||||
int device_;
|
||||
thrust::device_vector<T> data_;
|
||||
dh::device_vector<T> data_;
|
||||
// cached vector size
|
||||
size_t cached_size_;
|
||||
size_t start_;
|
||||
|
||||
Reference in New Issue
Block a user