Retire DVec class in favour of c++20 style span for device memory. (#4293)

This commit is contained in:
Rory Mitchell
2019-03-28 13:59:58 +13:00
committed by GitHub
parent c85181dd8a
commit 3f312e30db
7 changed files with 288 additions and 369 deletions

View File

@@ -227,179 +227,79 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
LaunchN<ITEMS_PER_THREAD, BLOCK_THREADS>(device_idx, n, nullptr, lambda);
}
/*
* Memory
/**
* \brief A double buffer, useful for algorithms like sort.
*/
enum MemoryType { kDevice, kDeviceManaged };
template <MemoryType MemoryT>
class BulkAllocator;
template <typename T>
class DVec2;
template <typename T>
class DVec {
friend class DVec2<T>;
private:
T *ptr_;
size_t size_;
int device_idx_;
class DoubleBuffer {
public:
void ExternalAllocate(int device_idx, void *ptr, size_t size) {
if (!Empty()) {
throw std::runtime_error("Tried to allocate DVec but already allocated");
}
ptr_ = static_cast<T *>(ptr);
size_ = size;
device_idx_ = device_idx;
safe_cuda(cudaSetDevice(device_idx_));
cub::DoubleBuffer<T> buff;
xgboost::common::Span<T> a, b;
DoubleBuffer() = default;
size_t Size() const {
CHECK_EQ(a.size(), b.size());
return a.size();
}
cub::DoubleBuffer<T> &CubBuffer() { return buff; }
T *Current() { return buff.Current(); }
xgboost::common::Span<T> CurrentSpan() {
return xgboost::common::Span<T>{
buff.Current(),
static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
}
DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
size_t Size() const { return size_; }
int DeviceIdx() const { return device_idx_; }
bool Empty() const { return ptr_ == NULL || size_ == 0; }
T *Data() { return ptr_; }
const T *Data() const { return ptr_; }
xgboost::common::Span<const T> GetSpan() const {
return xgboost::common::Span<const T>(ptr_, this->Size());
}
xgboost::common::Span<T> GetSpan() {
return xgboost::common::Span<T>(ptr_, this->Size());
}
std::vector<T> AsVector() const {
std::vector<T> h_vector(Size());
safe_cuda(cudaSetDevice(device_idx_));
safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
cudaMemcpyDeviceToHost));
return h_vector;
}
void Fill(T value) {
auto d_ptr = ptr_;
LaunchN(device_idx_, Size(),
[=] __device__(size_t idx) { d_ptr[idx] = value; });
}
void Print() {
auto h_vector = this->AsVector();
for (auto e : h_vector) {
std::cout << e << " ";
}
std::cout << "\n";
}
thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(ptr_); }
thrust::device_ptr<T> tend() {
return thrust::device_pointer_cast(ptr_ + Size());
}
template <typename T2>
DVec &operator=(const std::vector<T2> &other) {
this->copy(other.begin(), other.end());
return *this;
}
DVec &operator=(DVec<T> &other) {
if (other.Size() != Size()) {
throw std::runtime_error(
"Cannot copy assign DVec to DVec, sizes are different");
}
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (other.DeviceIdx() == this->DeviceIdx()) {
dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
other.Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
} else {
std::cout << "deviceother: " << other.DeviceIdx()
<< " devicethis: " << this->DeviceIdx() << std::endl;
std::cout << "size deviceother: " << other.Size()
<< " devicethis: " << this->DeviceIdx() << std::endl;
throw std::runtime_error("Cannot copy to/from different devices");
}
return *this;
}
template <typename IterT>
void copy(IterT begin, IterT end) {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different" <<
" vector::Size(): " << end - begin << " DVec::Size(): " << Size();
}
thrust::copy(begin, end, this->tbegin());
}
void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
}
safe_cuda(cudaMemcpyAsync(this->Data(), begin.get(), Size() * sizeof(T),
cudaMemcpyDefault));
}
T *other() { return buff.Alternate(); }
};
/**
* @class DVec2 device_helpers.cuh
* @brief wrapper for storing 2 DVec's which are needed for cub::DoubleBuffer
* \brief Copies device span to std::vector.
*
* \tparam T Generic type parameter.
* \param [in,out] dst Copy destination.
* \param src Copy source. Must be device memory.
*/
template <typename T>
class DVec2 {
private:
DVec<T> d1_, d2_;
cub::DoubleBuffer<T> buff_;
int device_idx_;
void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<T> src) {
  // The host vector is not resized here: its length must already equal the
  // length of the device span, which CHECK_EQ verifies.
  CHECK_EQ(dst->size(), src.size());
  // Byte count is derived from the destination length.
  const auto n_bytes = dst->size() * sizeof(T);
  T *host_ptr = dst->data();
  // Device -> host transfer; no explicit stream argument is passed.
  dh::safe_cuda(
      cudaMemcpyAsync(host_ptr, src.data(), n_bytes, cudaMemcpyDeviceToHost));
}
public:
void ExternalAllocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
if (!Empty()) {
throw std::runtime_error("Tried to allocate DVec2 but already allocated");
}
device_idx_ = device_idx;
d1_.ExternalAllocate(device_idx_, ptr1, size);
d2_.ExternalAllocate(device_idx_, ptr2, size);
buff_.d_buffers[0] = static_cast<T *>(ptr1);
buff_.d_buffers[1] = static_cast<T *>(ptr2);
buff_.selector = 0;
}
DVec2() : d1_(), d2_(), buff_(), device_idx_(-1) {}
/**
 * \brief Copies the contents of a host std::vector into a device span.
 *
 * The two containers must hold the same number of elements (verified with
 * CHECK_EQ). The copy is issued through cudaMemcpyAsync without an explicit
 * stream argument.
 *
 * \tparam T Element type shared by source and destination.
 * \param dst Copy destination. Must be device memory.
 * \param src Copy source (host vector).
 */
template <typename T>
void CopyVectorToDeviceSpan(xgboost::common::Span<T> dst,
                            const std::vector<T> &src) {
  CHECK_EQ(dst.size(), src.size());
  // Byte count is derived from the destination length.
  const auto n_bytes = dst.size() * sizeof(T);
  const T *host_ptr = src.data();
  dh::safe_cuda(
      cudaMemcpyAsync(dst.data(), host_ptr, n_bytes, cudaMemcpyHostToDevice));
}
size_t Size() const { return d1_.Size(); }
int DeviceIdx() const { return device_idx_; }
bool Empty() const { return d1_.Empty() || d2_.Empty(); }
cub::DoubleBuffer<T> &buff() { return buff_; }
DVec<T> &D1() { return d1_; }
DVec<T> &D2() { return d2_; }
T *Current() { return buff_.Current(); }
xgboost::common::Span<T> CurrentSpan() {
return xgboost::common::Span<T>{
buff_.Current(),
static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
}
DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
T *other() { return buff_.Alternate(); }
};
/**
 * \brief Device-to-device copy of one span into another.
 *
 * Both spans must contain the same number of elements (verified with
 * CHECK_EQ). To copy a smaller array into part of a larger one, pass a
 * subspan of the larger array as the destination. The copy is issued through
 * cudaMemcpyAsync without an explicit stream argument.
 *
 * \tparam T Element type shared by source and destination.
 * \param dst Copy destination. Must be device memory.
 * \param src Copy source. Must be device memory.
 */
template <typename T>
void CopyDeviceSpan(xgboost::common::Span<T> dst,
                    xgboost::common::Span<T> src) {
  CHECK_EQ(dst.size(), src.size());
  // Byte count is derived from the destination length.
  const auto n_bytes = dst.size() * sizeof(T);
  dh::safe_cuda(cudaMemcpyAsync(dst.data(), src.data(), n_bytes,
                                cudaMemcpyDeviceToDevice));
}
/*! \brief Helper for allocating large block of memory. */
template <MemoryType MemoryT>
class BulkAllocator {
std::vector<char *> d_ptr_;
std::vector<size_t> size_;
@@ -413,70 +313,73 @@ class BulkAllocator {
}
template <typename T>
size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size) {
size_t GetSizeBytes(xgboost::common::Span<T> *first_vec, size_t first_size) {
return AlignRoundUp(first_size * sizeof(T));
}
template <typename T, typename... Args>
size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size, Args... args) {
size_t GetSizeBytes(xgboost::common::Span<T> *first_vec, size_t first_size, Args... args) {
return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
}
template <typename T>
void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
size_t first_size) {
first_vec->ExternalAllocate(device_idx, static_cast<void *>(ptr),
first_size);
void AllocateSpan(int device_idx, char *ptr, xgboost::common::Span<T> *first_vec,
size_t first_size) {
*first_vec = xgboost::common::Span<T>(reinterpret_cast<T *>(ptr), first_size);
}
template <typename T, typename... Args>
void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
size_t first_size, Args... args) {
AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
void AllocateSpan(int device_idx, char *ptr, xgboost::common::Span<T> *first_vec,
size_t first_size, Args... args) {
AllocateSpan<T>(device_idx, ptr, first_vec, first_size);
ptr += AlignRoundUp(first_size * sizeof(T));
AllocateDVec(device_idx, ptr, args...);
AllocateSpan(device_idx, ptr, args...);
}
char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
char *AllocateDevice(int device_idx, size_t bytes) {
char *ptr;
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaMalloc(&ptr, bytes));
return ptr;
}
template <typename T>
size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size) {
size_t GetSizeBytes(DoubleBuffer<T> *first_vec, size_t first_size) {
return 2 * AlignRoundUp(first_size * sizeof(T));
}
template <typename T, typename... Args>
size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size, Args... args) {
size_t GetSizeBytes(DoubleBuffer<T> *first_vec, size_t first_size, Args... args) {
return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
}
template <typename T>
void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
size_t first_size) {
first_vec->ExternalAllocate(
device_idx, static_cast<void *>(ptr),
static_cast<void *>(ptr + AlignRoundUp(first_size * sizeof(T))),
first_size);
void AllocateSpan(int device_idx, char *ptr, DoubleBuffer<T> *first_vec,
size_t first_size) {
auto ptr1 = reinterpret_cast<T *>(ptr);
auto ptr2 = ptr1 + first_size;
first_vec->a = xgboost::common::Span<T>(ptr1, first_size);
first_vec->b = xgboost::common::Span<T>(ptr2, first_size);
first_vec->buff.d_buffers[0] = ptr1;
first_vec->buff.d_buffers[1] = ptr2;
first_vec->buff.selector = 0;
}
template <typename T, typename... Args>
void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
void AllocateSpan(int device_idx, char *ptr, DoubleBuffer<T> *first_vec,
size_t first_size, Args... args) {
AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
AllocateSpan<T>(device_idx, ptr, first_vec, first_size);
ptr += (AlignRoundUp(first_size * sizeof(T)) * 2);
AllocateDVec(device_idx, ptr, args...);
AllocateSpan(device_idx, ptr, args...);
}
public:
BulkAllocator() = default;
// prevent accidental copying, moving or assignment of this object
BulkAllocator(const BulkAllocator<MemoryT>&) = delete;
BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
void operator=(const BulkAllocator<MemoryT>&) = delete;
void operator=(BulkAllocator<MemoryT>&&) = delete;
BulkAllocator(const BulkAllocator&) = delete;
BulkAllocator(BulkAllocator&&) = delete;
void operator=(const BulkAllocator&) = delete;
void operator=(BulkAllocator&&) = delete;
~BulkAllocator() {
for (size_t i = 0; i < d_ptr_.size(); i++) {
@@ -497,9 +400,9 @@ class BulkAllocator {
void Allocate(int device_idx, Args... args) {
size_t size = GetSizeBytes(args...);
char *ptr = AllocateDevice(device_idx, size, MemoryT);
char *ptr = AllocateDevice(device_idx, size);
AllocateDVec(device_idx, ptr, args...);
AllocateSpan(device_idx, ptr, args...);
d_ptr_.push_back(ptr);
size_.push_back(size);
@@ -582,28 +485,6 @@ struct CubMemory {
* Utility functions
*/
/**
 * \brief Prints the leading elements of a device vector to stdout.
 *
 * The whole vector is first copied to the host, then at most max_items
 * elements are written, each preceded by a single space, followed by a
 * newline.
 *
 * \tparam T Element type; must be streamable to std::cout.
 * \param v Device vector to print.
 * \param max_items Upper bound on the number of elements printed.
 */
template <typename T>
void Print(const DVec<T> &v, size_t max_items = 10) {
  // DVec exposes its host copy as AsVector(); it has no as_vector() member,
  // so the previous spelling could not be instantiated.
  const std::vector<T> h = v.AsVector();
  // Hoist the bound so it is evaluated once rather than on every iteration.
  const size_t n_print = std::min(max_items, h.size());
  for (size_t i = 0; i < n_print; ++i) {
    std::cout << " " << h[i];
  }
  std::cout << "\n";
}
/**
 * @brief Helper macro to time a call.
 *
 * Constructs a dh::Timer, evaluates `call`, then invokes the timer's
 * printElapsed(name). The expansion is wrapped in do { ... } while (0) so it
 * behaves as a single statement at the call site.
 *
 * NOTE(review): dh::Timer is defined elsewhere; whether it accounts for
 * asynchronous GPU work (e.g. by synchronizing) is not visible here - confirm
 * before relying on it for kernel timings.
 *
 * @param call the GPU call (any statement or expression to be timed)
 * @param name name passed to printElapsed to label the measurement
 */
#define TIMEIT(call, name) \
do { \
dh::Timer t1234; \
call; \
t1234.printElapsed(name); \
} while (0)
// Load balancing search
template <typename CoordinateT, typename SegmentT, typename OffsetT>
@@ -762,18 +643,18 @@ void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
* @param offsets the segments
*/
template <typename T1, typename T2>
void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
dh::DVec2<T2> *vals, int nVals, int nSegs,
const dh::DVec<int> &offsets, int start = 0,
void SegmentedSort(dh::CubMemory *tmp_mem, dh::DoubleBuffer<T1> *keys,
dh::DoubleBuffer<T2> *vals, int nVals, int nSegs,
xgboost::common::Span<int> offsets, int start = 0,
int end = sizeof(T1) * 8) {
size_t tmpSize;
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.Data(),
offsets.Data() + 1, start, end));
NULL, tmpSize, keys->CubBuffer(), vals->CubBuffer(), nVals, nSegs,
offsets.data(), offsets.data() + 1, start, end));
tmp_mem->LazyAllocate(tmpSize);
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
nSegs, offsets.Data(), offsets.Data() + 1, start, end));
tmp_mem->d_temp_storage, tmpSize, keys->CubBuffer(), vals->CubBuffer(),
nVals, nSegs, offsets.data(), offsets.data() + 1, start, end));
}
/**
@@ -784,14 +665,14 @@ void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
* @param nVals number of elements in the input array
*/
template <typename T>
void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
void SumReduction(dh::CubMemory &tmp_mem, xgboost::common::Span<T> in, xgboost::common::Span<T> out,
int nVals) {
size_t tmpSize;
dh::safe_cuda(
cub::DeviceReduce::Sum(NULL, tmpSize, in.Data(), out.Data(), nVals));
cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
tmp_mem.LazyAllocate(tmpSize);
dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
in.Data(), out.Data(), nVals));
in.data(), out.data(), nVals));
}
/**