reset device helper

This commit is contained in:
Hendrik Groove 2024-10-21 23:53:19 +02:00
parent 1c666db349
commit 2a554ba4a7
2 changed files with 35 additions and 50 deletions

View File

@ -56,7 +56,7 @@
#define cudaPointerGetAttributes hipPointerGetAttributes #define cudaPointerGetAttributes hipPointerGetAttributes
/* hipMemoryTypeUnregistered not supported */ /* hipMemoryTypeUnregistered not supported */
#define cudaMemoryTypeUnregistered hipMemoryTypeUnregistered #define cudaMemoryTypeUnregistered hipMemoryTypeUnified
#define cudaMemoryTypeUnified hipMemoryTypeUnified #define cudaMemoryTypeUnified hipMemoryTypeUnified
#define cudaMemoryTypeHost hipMemoryTypeHost #define cudaMemoryTypeHost hipMemoryTypeHost

View File

@ -19,10 +19,6 @@
#include <thrust/transform_scan.h> #include <thrust/transform_scan.h>
#include <thrust/unique.h> #include <thrust/unique.h>
#include <hip/hip_runtime.h>
#include <thrust/system/hip/execution_policy.h>
#include <thrust/system/hip/detail/get_value.h>
#include <algorithm> #include <algorithm>
#include <chrono> #include <chrono>
#include <cstddef> // for size_t #include <cstddef> // for size_t
@ -398,14 +394,13 @@ inline void ThrowOOMError(std::string const& err, size_t bytes) {
template <class T> template <class T>
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> { struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>; using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; using pointer = thrust::device_ptr<T>; // NOLINT
template<typename U> template<typename U>
struct rebind { struct rebind // NOLINT
using other = XGBDefaultDeviceAllocatorImpl<U>; {
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
}; };
pointer allocate(size_t n) { // NOLINT
pointer allocate(size_t n) {
pointer ptr; pointer ptr;
try { try {
ptr = SuperT::allocate(n); ptr = SuperT::allocate(n);
@ -416,15 +411,14 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
return ptr; return ptr;
} }
void deallocate(pointer ptr, size_t n) { // NOLINT
void deallocate(pointer ptr, size_t n) {
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
SuperT::deallocate(ptr, n); SuperT::deallocate(ptr, n);
} }
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl() : SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()) {} XGBDefaultDeviceAllocatorImpl()
#endif : SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
}; };
/** /**
@ -447,11 +441,12 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
std::make_unique<hipcub::CachingDeviceAllocator>(2, 9, 29)}; std::make_unique<hipcub::CachingDeviceAllocator>(2, 9, 29)};
return *allocator; return *allocator;
} }
pointer allocate(size_t n) { pointer allocate(size_t n) { // NOLINT
pointer thrust_ptr; pointer thrust_ptr;
if (use_cub_allocator_) { if (use_cub_allocator_) {
T* raw_ptr{nullptr}; T* raw_ptr{nullptr};
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr), n * sizeof(T)); auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
if (errc != hipSuccess) { if (errc != hipSuccess) {
ThrowOOMError("Caching allocator", n * sizeof(T)); ThrowOOMError("Caching allocator", n * sizeof(T));
} }
@ -467,7 +462,7 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr; return thrust_ptr;
} }
void deallocate(pointer ptr, size_t n) { void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
if (use_cub_allocator_) { if (use_cub_allocator_) {
GetGlobalCachingAllocator().DeviceFree(ptr.get()); GetGlobalCachingAllocator().DeviceFree(ptr.get());
@ -606,7 +601,8 @@ void CopyToD(HContainer const &h, DContainer *d) {
d->resize(h.size()); d->resize(h.size());
using HVT = std::remove_cv_t<typename HContainer::value_type>; using HVT = std::remove_cv_t<typename HContainer::value_type>;
using DVT = std::remove_cv_t<typename DContainer::value_type>; using DVT = std::remove_cv_t<typename DContainer::value_type>;
static_assert(std::is_same<HVT, DVT>::value, "Host and device containers must have same value type."); static_assert(std::is_same<HVT, DVT>::value,
"Host and device containers must have same value type.");
dh::safe_cuda(hipMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT), dh::safe_cuda(hipMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT),
hipMemcpyHostToDevice)); hipMemcpyHostToDevice));
} }
@ -658,22 +654,20 @@ struct PinnedMemory {
template <typename T> template <typename T>
typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) { typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
using ValueT = typename std::iterator_traits<T>::value_type; using ValueT = typename std::iterator_traits<T>::value_type;
size_t tmpSize {0}; size_t tmpSize {0};
ValueT *dummy_out = nullptr; ValueT *dummy_out = nullptr;
try {
dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals)); dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
TemporaryArray<char> temp(tmpSize + sizeof(ValueT)); TemporaryArray<char> temp(tmpSize + sizeof(ValueT));
auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1; auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1;
dh::safe_cuda(hipcub::DeviceReduce::Sum( dh::safe_cuda(hipcub::DeviceReduce::Sum(
reinterpret_cast<void *>(ptr), tmpSize, in, reinterpret_cast<ValueT *>(temp.data().get()), nVals)); reinterpret_cast<void *>(ptr), tmpSize, in,
reinterpret_cast<ValueT *>(temp.data().get()),
nVals));
ValueT sum; ValueT sum;
dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT), hipMemcpyDeviceToHost)); dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT),
hipMemcpyDeviceToHost));
return sum; return sum;
} catch (const std::exception& e) {
throw;
}
} }
constexpr std::pair<int, int> CUDAVersion() { constexpr std::pair<int, int> CUDAVersion() {
@ -961,24 +955,15 @@ template <typename Policy, typename InputIt, typename Init, typename Func>
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) { auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2; size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
size_t size = std::distance(first, second); size_t size = std::distance(first, second);
using Ty = std::remove_cv_t<Init>; using Ty = std::remove_cv_t<Init>;
Ty aggregate = init; Ty aggregate = init;
for (size_t offset = 0; offset < size; offset += kLimit) { for (size_t offset = 0; offset < size; offset += kLimit) {
auto begin_it = first + offset; auto begin_it = first + offset;
auto end_it = first + std::min(offset + kLimit, size); auto end_it = first + std::min(offset + kLimit, size);
size_t batch_size = std::distance(begin_it, end_it); size_t batch_size = std::distance(begin_it, end_it);
CHECK_LE(batch_size, size); CHECK_LE(batch_size, size);
try {
// Print the iterator types
auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op); auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
aggregate = reduce_op(aggregate, ret); aggregate = reduce_op(aggregate, ret);
} catch (const thrust::system_error& e) {
throw;
} catch (const std::exception& e) {
throw;
}
} }
return aggregate; return aggregate;
} }
@ -1066,7 +1051,7 @@ inline CUDAStreamView DefaultStream() {
#ifdef HIP_API_PER_THREAD_DEFAULT_STREAM #ifdef HIP_API_PER_THREAD_DEFAULT_STREAM
return CUDAStreamView{hipStreamPerThread}; return CUDAStreamView{hipStreamPerThread};
#else #else
return CUDAStreamView{hipStreamDefault}; return CUDAStreamView{hipStreamLegacy};
#endif #endif
} }