Retire DVec class in favour of c++20 style span for device memory. (#4293)

2019-03-28 13:59:58 +13:00
parent c85181dd8a
commit 3f312e30db
7 changed files with 288 additions and 369 deletions
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -227,179 +227,79 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
  LaunchN<ITEMS_PER_THREAD, BLOCK_THREADS>(device_idx, n, nullptr, lambda);
 }

-/*
- * Memory
+
+/**
+ * \brief A double buffer, useful for algorithms like sort.
 */
-
-enum MemoryType { kDevice, kDeviceManaged };
-
-template <MemoryType MemoryT>
-class BulkAllocator;
 template <typename T>
-class DVec2;
-
-template <typename T>
-class DVec {
-  friend class DVec2<T>;
-
- private:
-  T *ptr_;
-  size_t size_;
-  int device_idx_;
-
+class DoubleBuffer {
 public:
-  void ExternalAllocate(int device_idx, void *ptr, size_t size) {
-    if (!Empty()) {
-      throw std::runtime_error("Tried to allocate DVec but already allocated");
-    }
-    ptr_ = static_cast<T *>(ptr);
-    size_ = size;
-    device_idx_ = device_idx;
-    safe_cuda(cudaSetDevice(device_idx_));
+  cub::DoubleBuffer<T> buff;
+  xgboost::common::Span<T> a, b;
+  DoubleBuffer() = default;
+
+  size_t Size() const {
+    CHECK_EQ(a.size(), b.size());
+    return a.size();
+  }
+  cub::DoubleBuffer<T> &CubBuffer() { return buff; }
+
+  T *Current() { return buff.Current(); }
+  xgboost::common::Span<T> CurrentSpan() {
+    return xgboost::common::Span<T>{
+        buff.Current(),
+        static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
  }

-  DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
-  size_t Size() const { return size_; }
-  int DeviceIdx() const { return device_idx_; }
-  bool Empty() const { return ptr_ == NULL || size_ == 0; }
-
-  T *Data() { return ptr_; }
-
-  const T *Data() const { return ptr_; }
-
-  xgboost::common::Span<const T> GetSpan() const {
-    return xgboost::common::Span<const T>(ptr_, this->Size());
-  }
-
-  xgboost::common::Span<T> GetSpan() {
-    return xgboost::common::Span<T>(ptr_, this->Size());
-  }
-
-  std::vector<T> AsVector() const {
-    std::vector<T> h_vector(Size());
-    safe_cuda(cudaSetDevice(device_idx_));
-    safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
-                         cudaMemcpyDeviceToHost));
-    return h_vector;
-  }
-
-  void Fill(T value) {
-    auto d_ptr = ptr_;
-    LaunchN(device_idx_, Size(),
-             [=] __device__(size_t idx) { d_ptr[idx] = value; });
-  }
-
-  void Print() {
-    auto h_vector = this->AsVector();
-    for (auto e : h_vector) {
-      std::cout << e << " ";
-    }
-    std::cout << "\n";
-  }
-
-  thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(ptr_); }
-
-  thrust::device_ptr<T> tend() {
-    return thrust::device_pointer_cast(ptr_ + Size());
-  }
-
-  template <typename T2>
-  DVec &operator=(const std::vector<T2> &other) {
-    this->copy(other.begin(), other.end());
-    return *this;
-  }
-
-  DVec &operator=(DVec<T> &other) {
-    if (other.Size() != Size()) {
-      throw std::runtime_error(
-          "Cannot copy assign DVec to DVec, sizes are different");
-    }
-    safe_cuda(cudaSetDevice(this->DeviceIdx()));
-    if (other.DeviceIdx() == this->DeviceIdx()) {
-      dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
-                               other.Size() * sizeof(T),
-                               cudaMemcpyDeviceToDevice));
-    } else {
-      std::cout << "deviceother: " << other.DeviceIdx()
-                << " devicethis: " << this->DeviceIdx() << std::endl;
-      std::cout << "size deviceother: " << other.Size()
-                << " devicethis: " << this->DeviceIdx() << std::endl;
-      throw std::runtime_error("Cannot copy to/from different devices");
-    }
-
-    return *this;
-  }
-
-  template <typename IterT>
-  void copy(IterT begin, IterT end) {
-    safe_cuda(cudaSetDevice(this->DeviceIdx()));
-    if (end - begin != Size()) {
-      LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different" <<
-        " vector::Size(): " << end - begin << " DVec::Size(): " << Size();
-    }
-    thrust::copy(begin, end, this->tbegin());
-  }
-
-  void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
-    safe_cuda(cudaSetDevice(this->DeviceIdx()));
-    if (end - begin != Size()) {
-      throw std::runtime_error(
-          "Cannot copy assign vector to dvec, sizes are different");
-    }
-    safe_cuda(cudaMemcpyAsync(this->Data(), begin.get(), Size() * sizeof(T),
-                         cudaMemcpyDefault));
-  }
+  T *other() { return buff.Alternate(); }
 };

 /**
- * @class DVec2 device_helpers.cuh
- * @brief wrapper for storing 2 DVec's which are needed for cub::DoubleBuffer
+ * \brief Copies device span to std::vector.
+ *
+ * \tparam  T Generic type parameter.
+ * \param [in,out]  dst Copy destination. Must already be sized to match src.
+ * \param           src Copy source. Must be device memory.
 */
 template <typename T>
-class DVec2 {
- private:
-  DVec<T> d1_, d2_;
-  cub::DoubleBuffer<T> buff_;
-  int device_idx_;
+void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<T> src) {
+  CHECK_EQ(dst->size(), src.size());
+  const auto n_bytes = dst->size() * sizeof(T);  // sizes match, so this covers src too
+  dh::safe_cuda(cudaMemcpyAsync(dst->data(), src.data(), n_bytes, cudaMemcpyDeviceToHost));
+}

- public:
-  void ExternalAllocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
-    if (!Empty()) {
-      throw std::runtime_error("Tried to allocate DVec2 but already allocated");
-    }
-    device_idx_ = device_idx;
-    d1_.ExternalAllocate(device_idx_, ptr1, size);
-    d2_.ExternalAllocate(device_idx_, ptr2, size);
-    buff_.d_buffers[0] = static_cast<T *>(ptr1);
-    buff_.d_buffers[1] = static_cast<T *>(ptr2);
-    buff_.selector = 0;
-  }
-  DVec2() : d1_(), d2_(), buff_(), device_idx_(-1) {}
+/**
+ * \brief Copies std::vector to device span.
+ *
+ * \tparam  T Generic type parameter.
+ * \param dst Copy destination. Must be device memory of the same size as src.
+ * \param src Copy source.
+ */
+template <typename T>
+void CopyVectorToDeviceSpan(xgboost::common::Span<T> dst,
+                            const std::vector<T> &src) {
+  CHECK_EQ(dst.size(), src.size());
+  const auto n_bytes = dst.size() * sizeof(T);  // sizes match, so this covers src too
+  dh::safe_cuda(cudaMemcpyAsync(dst.data(), src.data(), n_bytes, cudaMemcpyHostToDevice));
+}

-  size_t Size() const { return d1_.Size(); }
-  int DeviceIdx() const { return device_idx_; }
-  bool Empty() const { return d1_.Empty() || d2_.Empty(); }
-
-  cub::DoubleBuffer<T> &buff() { return buff_; }
-
-  DVec<T> &D1() { return d1_; }
-
-  DVec<T> &D2() { return d2_; }
-
-  T *Current() { return buff_.Current(); }
-  xgboost::common::Span<T> CurrentSpan() {
-    return xgboost::common::Span<T>{
-      buff_.Current(),
-      static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
-  }
-
-  DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
-
-  T *other() { return buff_.Alternate(); }
-};
+/**
+ * \brief Device to device memory copy from src to dst. Spans must be the same size. Use subspan to
+ *        copy from a smaller array to a larger array.
+ *
+ * \tparam  T Generic type parameter.
+ * \param dst Copy destination. Must be device memory.
+ * \param src Copy source. Must be device memory.
+ */
+template <typename T>
+void CopyDeviceSpan(xgboost::common::Span<T> dst, xgboost::common::Span<T> src) {
+  CHECK_EQ(dst.size(), src.size());
+  // The spans were just checked to be the same length, so one byte count serves both.
+  const auto n_bytes = dst.size() * sizeof(T);
+  dh::safe_cuda(cudaMemcpyAsync(dst.data(), src.data(), n_bytes, cudaMemcpyDeviceToDevice));
+}

 /*! \brief Helper for allocating large block of memory. */
-template <MemoryType MemoryT>
 class BulkAllocator {
  std::vector<char *> d_ptr_;
  std::vector<size_t> size_;
@@ -413,70 +313,73 @@ class BulkAllocator {
  }

  template <typename T>
-  size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size) {
+  size_t GetSizeBytes(xgboost::common::Span<T> *first_vec, size_t first_size) {
    return AlignRoundUp(first_size * sizeof(T));
  }

  template <typename T, typename... Args>
-  size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size, Args... args) {
+  size_t GetSizeBytes(xgboost::common::Span<T> *first_vec, size_t first_size, Args... args) {
    return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
  }

  template <typename T>
-  void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
-                     size_t first_size) {
-    first_vec->ExternalAllocate(device_idx, static_cast<void *>(ptr),
-                                 first_size);
+  void AllocateSpan(int device_idx, char *ptr, xgboost::common::Span<T> *first_vec,
+    size_t first_size) {
+    *first_vec = xgboost::common::Span<T>(reinterpret_cast<T *>(ptr), first_size);
  }

  template <typename T, typename... Args>
-  void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
-                     size_t first_size, Args... args) {
-    AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
+  void AllocateSpan(int device_idx, char *ptr, xgboost::common::Span<T> *first_vec,
+    size_t first_size, Args... args) {
+    AllocateSpan<T>(device_idx, ptr, first_vec, first_size);
    ptr += AlignRoundUp(first_size * sizeof(T));
-    AllocateDVec(device_idx, ptr, args...);
+    AllocateSpan(device_idx, ptr, args...);
  }

-  char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
+  char *AllocateDevice(int device_idx, size_t bytes) {
    char *ptr;
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaMalloc(&ptr, bytes));
    return ptr;
  }
+
  template <typename T>
-  size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size) {
+  size_t GetSizeBytes(DoubleBuffer<T> *first_vec, size_t first_size) {
    return 2 * AlignRoundUp(first_size * sizeof(T));
  }

  template <typename T, typename... Args>
-  size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size, Args... args) {
+  size_t GetSizeBytes(DoubleBuffer<T> *first_vec, size_t first_size, Args... args) {
    return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
  }

  template <typename T>
-  void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
-                     size_t first_size) {
-    first_vec->ExternalAllocate(
-        device_idx, static_cast<void *>(ptr),
-        static_cast<void *>(ptr + AlignRoundUp(first_size * sizeof(T))),
-        first_size);
+  void AllocateSpan(int device_idx, char *ptr, DoubleBuffer<T> *first_vec, size_t first_size) {
+    // Second half starts at the aligned offset GetSizeBytes reserves, as the removed code did.
+    auto ptr1 = reinterpret_cast<T *>(ptr);
+    auto ptr2 = reinterpret_cast<T *>(ptr + AlignRoundUp(first_size * sizeof(T)));
+    first_vec->a = xgboost::common::Span<T>(ptr1, first_size);
+    first_vec->b = xgboost::common::Span<T>(ptr2, first_size);
+    first_vec->buff.d_buffers[0] = ptr1;
+    first_vec->buff.d_buffers[1] = ptr2;
+    first_vec->buff.selector = 0;
  }

  template <typename T, typename... Args>
-  void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
+  void AllocateSpan(int device_idx, char *ptr, DoubleBuffer<T> *first_vec,
                     size_t first_size, Args... args) {
-    AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
+    AllocateSpan<T>(device_idx, ptr, first_vec, first_size);
    ptr += (AlignRoundUp(first_size * sizeof(T)) * 2);
-    AllocateDVec(device_idx, ptr, args...);
+    AllocateSpan(device_idx, ptr, args...);
  }

 public:
   BulkAllocator() = default;
  // prevent accidental copying, moving or assignment of this object
-  BulkAllocator(const BulkAllocator<MemoryT>&) = delete;
-  BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
-  void operator=(const BulkAllocator<MemoryT>&) = delete;
-  void operator=(BulkAllocator<MemoryT>&&) = delete;
+  BulkAllocator(const BulkAllocator&) = delete;
+  BulkAllocator(BulkAllocator&&) = delete;
+  void operator=(const BulkAllocator&) = delete;
+  void operator=(BulkAllocator&&) = delete;

  ~BulkAllocator() {
    for (size_t i = 0; i < d_ptr_.size(); i++) {
@@ -497,9 +400,9 @@ class BulkAllocator {
  void Allocate(int device_idx, Args... args) {
    size_t size = GetSizeBytes(args...);

-    char *ptr = AllocateDevice(device_idx, size, MemoryT);
+    char *ptr = AllocateDevice(device_idx, size);

-    AllocateDVec(device_idx, ptr, args...);
+    AllocateSpan(device_idx, ptr, args...);

    d_ptr_.push_back(ptr);
    size_.push_back(size);
@@ -582,28 +485,6 @@ struct CubMemory {
 *  Utility functions
 */

-template <typename T>
-void Print(const DVec<T> &v, size_t max_items = 10) {
-  std::vector<T> h = v.as_vector();
-  for (size_t i = 0; i < std::min(max_items, h.size()); i++) {
-    std::cout << " " << h[i];
-  }
-  std::cout << "\n";
-}
-
-/**
- * @brief Helper macro to measure timing on GPU
- * @param call the GPU call
- * @param name name used to track later
- * @param stream cuda stream where to measure time
- */
-#define TIMEIT(call, name)    \
-  do {                        \
-    dh::Timer t1234;          \
-    call;                     \
-    t1234.printElapsed(name); \
-  } while (0)
-
 // Load balancing search

 template <typename CoordinateT, typename SegmentT, typename OffsetT>
@@ -762,18 +643,18 @@ void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
 * @param offsets the segments
 */
 template <typename T1, typename T2>
-void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
-                   dh::DVec2<T2> *vals, int nVals, int nSegs,
-                   const dh::DVec<int> &offsets, int start = 0,
+void SegmentedSort(dh::CubMemory *tmp_mem, dh::DoubleBuffer<T1> *keys,
+                   dh::DoubleBuffer<T2> *vals, int nVals, int nSegs,
+                   xgboost::common::Span<int> offsets, int start = 0,
                   int end = sizeof(T1) * 8) {
  size_t tmpSize;
  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
-      NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.Data(),
-      offsets.Data() + 1, start, end));
+      NULL, tmpSize, keys->CubBuffer(), vals->CubBuffer(), nVals, nSegs,
+      offsets.data(), offsets.data() + 1, start, end));
  tmp_mem->LazyAllocate(tmpSize);
  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
-      tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
-      nSegs, offsets.Data(), offsets.Data() + 1, start, end));
+      tmp_mem->d_temp_storage, tmpSize, keys->CubBuffer(), vals->CubBuffer(),
+      nVals, nSegs, offsets.data(), offsets.data() + 1, start, end));
 }

 /**
@@ -784,14 +665,14 @@ void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
 * @param nVals number of elements in the input array
 */
 template <typename T>
-void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
+void SumReduction(dh::CubMemory &tmp_mem, xgboost::common::Span<T> in, xgboost::common::Span<T> out,
                  int nVals) {
  size_t tmpSize;
  dh::safe_cuda(
-      cub::DeviceReduce::Sum(NULL, tmpSize, in.Data(), out.Data(), nVals));
+      cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
  tmp_mem.LazyAllocate(tmpSize);
  dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
-                                       in.Data(), out.Data(), nVals));
+                                       in.data(), out.data(), nVals));
 }

 /**