Overload device memory allocation (#4532)
* Group source files, include headers in source files
* Overload device memory allocation
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
#pragma once
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
#include <thrust/system_error.h>
|
||||
#include <xgboost/logging.h>
|
||||
@@ -49,11 +50,6 @@ inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
T *Raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
inline void CudaCheckPointerDevice(void* ptr) {
|
||||
cudaPointerAttributes attr;
|
||||
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
|
||||
@@ -225,6 +221,97 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
|
||||
LaunchN<ITEMS_PER_THREAD, BLOCK_THREADS>(device_idx, n, nullptr, lambda);
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
/** \brief Keeps track of global device memory allocations. Thread safe.*/
|
||||
class MemoryLogger {
|
||||
// Information for a single device
|
||||
struct DeviceStats {
|
||||
size_t currently_allocated_bytes{ 0 };
|
||||
size_t peak_allocated_bytes{ 0 };
|
||||
size_t num_allocations{ 0 };
|
||||
size_t num_deallocations{ 0 };
|
||||
std::map<void *, size_t> device_allocations;
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
device_allocations[ptr] = n;
|
||||
currently_allocated_bytes += n;
|
||||
peak_allocated_bytes =
|
||||
std::max(peak_allocated_bytes, currently_allocated_bytes);
|
||||
num_allocations++;
|
||||
}
|
||||
void RegisterDeallocation(void *ptr) {
|
||||
num_deallocations++;
|
||||
currently_allocated_bytes -= device_allocations[ptr];
|
||||
device_allocations.erase(ptr);
|
||||
}
|
||||
};
|
||||
std::map<int, DeviceStats>
|
||||
stats_; // Map device ordinal to memory information
|
||||
std::mutex mutex_;
|
||||
|
||||
public:
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterAllocation(ptr, n);
|
||||
}
|
||||
void RegisterDeallocation(void *ptr) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_[current_device].RegisterDeallocation(ptr);
|
||||
}
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug))
|
||||
return;
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
for (const auto &kv : stats_) {
|
||||
LOG(CONSOLE) << "======== Device " << kv.first << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< kv.second.peak_allocated_bytes / 1000000 << "mb";
|
||||
LOG(CONSOLE) << "Number of allocations: " << kv.second.num_allocations;
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Process-wide memory logger singleton, constructed on first use.
inline detail::MemoryLogger &GlobalMemoryLogger() {
  static detail::MemoryLogger global_memory_logger;
  return global_memory_logger;
}
|
||||
|
||||
namespace detail{
|
||||
/**
|
||||
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
|
||||
*/
|
||||
template <class T>
|
||||
struct XGBDefaultDeviceAllocator : thrust::device_malloc_allocator<T> {
|
||||
using super_t = thrust::device_malloc_allocator<T>;
|
||||
using pointer = thrust::device_ptr<T>;
|
||||
pointer allocate(size_t n) {
|
||||
pointer ptr = super_t::allocate(n);
|
||||
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n);
|
||||
return ptr;
|
||||
}
|
||||
void deallocate(pointer ptr, size_t n) {
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr.get());
|
||||
return super_t::deallocate(ptr, n);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Declare xgboost allocator
// Replacement of allocator with custom backend should occur here
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocator<T>;
/** \brief Specialisation of thrust device vector using custom allocator. */
template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;
|
||||
|
||||
/**
|
||||
* \brief A double buffer, useful for algorithms like sort.
|
||||
@@ -335,10 +422,9 @@ class BulkAllocator {
|
||||
}
|
||||
|
||||
char *AllocateDevice(int device_idx, size_t bytes) {
|
||||
char *ptr;
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaMalloc(&ptr, bytes));
|
||||
return ptr;
|
||||
XGBDeviceAllocator<char> allocator;
|
||||
return allocator.allocate(bytes).get();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -383,7 +469,8 @@ class BulkAllocator {
|
||||
for (size_t i = 0; i < d_ptr_.size(); i++) {
|
||||
if (!(d_ptr_[i] == nullptr)) {
|
||||
safe_cuda(cudaSetDevice(device_idx_[i]));
|
||||
safe_cuda(cudaFree(d_ptr_[i]));
|
||||
XGBDeviceAllocator<char> allocator;
|
||||
allocator.deallocate(thrust::device_ptr<char>(d_ptr_[i]), size_[i]);
|
||||
d_ptr_[i] = nullptr;
|
||||
}
|
||||
}
|
||||
@@ -453,14 +540,17 @@ struct CubMemory {
|
||||
|
||||
void Free() {
|
||||
if (this->IsAllocated()) {
|
||||
safe_cuda(cudaFree(d_temp_storage));
|
||||
XGBDeviceAllocator<uint8_t> allocator;
|
||||
allocator.deallocate(thrust::device_ptr<uint8_t>(static_cast<uint8_t *>(d_temp_storage)),
|
||||
temp_storage_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
void LazyAllocate(size_t num_bytes) {
|
||||
if (num_bytes > temp_storage_bytes) {
|
||||
Free();
|
||||
safe_cuda(cudaMalloc(&d_temp_storage, num_bytes));
|
||||
XGBDeviceAllocator<uint8_t> allocator;
|
||||
d_temp_storage = static_cast<void *>(allocator.allocate(num_bytes).get());
|
||||
temp_storage_bytes = num_bytes;
|
||||
}
|
||||
}
|
||||
@@ -1119,7 +1209,7 @@ ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
|
||||
template <typename T,
|
||||
typename IndexT = typename xgboost::common::Span<T>::index_type>
|
||||
xgboost::common::Span<T> ToSpan(
|
||||
thrust::device_vector<T>& vec,
|
||||
device_vector<T>& vec,
|
||||
IndexT offset = 0,
|
||||
IndexT size = -1) {
|
||||
size = size == -1 ? vec.size() : size;
|
||||
|
||||
@@ -130,18 +130,18 @@ struct GPUSketcher {
|
||||
|
||||
tree::TrainParam param_;
|
||||
SketchContainer *sketch_container_;
|
||||
thrust::device_vector<size_t> row_ptrs_;
|
||||
thrust::device_vector<Entry> entries_;
|
||||
thrust::device_vector<bst_float> fvalues_;
|
||||
thrust::device_vector<bst_float> feature_weights_;
|
||||
thrust::device_vector<bst_float> fvalues_cur_;
|
||||
thrust::device_vector<WXQSketch::Entry> cuts_d_;
|
||||
dh::device_vector<size_t> row_ptrs_;
|
||||
dh::device_vector<Entry> entries_;
|
||||
dh::device_vector<bst_float> fvalues_;
|
||||
dh::device_vector<bst_float> feature_weights_;
|
||||
dh::device_vector<bst_float> fvalues_cur_;
|
||||
dh::device_vector<WXQSketch::Entry> cuts_d_;
|
||||
thrust::host_vector<WXQSketch::Entry> cuts_h_;
|
||||
thrust::device_vector<bst_float> weights_;
|
||||
thrust::device_vector<bst_float> weights2_;
|
||||
dh::device_vector<bst_float> weights_;
|
||||
dh::device_vector<bst_float> weights2_;
|
||||
std::vector<size_t> n_cuts_cur_;
|
||||
thrust::device_vector<size_t> num_elements_;
|
||||
thrust::device_vector<char> tmp_storage_;
|
||||
dh::device_vector<size_t> num_elements_;
|
||||
dh::device_vector<char> tmp_storage_;
|
||||
|
||||
public:
|
||||
DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
|
||||
|
||||
@@ -161,7 +161,7 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
private:
|
||||
int device_;
|
||||
thrust::device_vector<T> data_;
|
||||
dh::device_vector<T> data_;
|
||||
// cached vector size
|
||||
size_t cached_size_;
|
||||
size_t start_;
|
||||
|
||||
Reference in New Issue
Block a user