reset device helper
parent 1c666db349
commit 2a554ba4a7
@@ -56,7 +56,7 @@
 #define cudaPointerGetAttributes hipPointerGetAttributes
 /* hipMemoryTypeUnregistered not supported */
-#define cudaMemoryTypeUnregistered hipMemoryTypeUnregistered
+#define cudaMemoryTypeUnregistered hipMemoryTypeUnified
 #define cudaMemoryTypeUnified hipMemoryTypeUnified
 #define cudaMemoryTypeHost hipMemoryTypeHost
 
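Note: with this change cudaMemoryTypeUnregistered is translated to hipMemoryTypeUnified instead of hipMemoryTypeUnregistered, which the comment above marks as unsupported. A small sketch of the consequence, using only the HIP enum (the cuda* translation macros come from the header being patched, whose file name is not shown in this diff): code that compares the translated values can no longer tell unregistered host memory apart from unified (managed) memory.

    #include <hip/hip_runtime.h>

    // Sketch: with the new mapping, both cudaMemoryTypeUnregistered and
    // cudaMemoryTypeUnified expand to hipMemoryTypeUnified, so both cases
    // land in the same branch here.
    const char *Describe(hipMemoryType t) {
      switch (t) {
        case hipMemoryTypeHost:    return "pinned host";
        case hipMemoryTypeDevice:  return "device";
        case hipMemoryTypeUnified: return "unified/managed (and, after this patch, 'unregistered')";
        default:                   return "other";
      }
    }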
@@ -19,10 +19,6 @@
 #include <thrust/transform_scan.h>
 #include <thrust/unique.h>
 
-#include <hip/hip_runtime.h>
-#include <thrust/system/hip/execution_policy.h>
-#include <thrust/system/hip/detail/get_value.h>
-
 #include <algorithm>
 #include <chrono>
 #include <cstddef>  // for size_t
@@ -398,14 +394,13 @@ inline void ThrowOOMError(std::string const& err, size_t bytes) {
 template <class T>
 struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
   using SuperT = XGBBaseDeviceAllocator<T>;
-  using pointer = thrust::device_ptr<T>;
-
+  using pointer = thrust::device_ptr<T>;  // NOLINT
   template<typename U>
-  struct rebind {
-    using other = XGBDefaultDeviceAllocatorImpl<U>;
+  struct rebind  // NOLINT
+  {
+    using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
   };
-
-  pointer allocate(size_t n) {
+  pointer allocate(size_t n) {  // NOLINT
     pointer ptr;
     try {
       ptr = SuperT::allocate(n);
@@ -416,15 +411,14 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
     GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
     return ptr;
   }
-
-  void deallocate(pointer ptr, size_t n) {
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
     GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
     SuperT::deallocate(ptr, n);
   }
-
 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBDefaultDeviceAllocatorImpl() : SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()) {}
-#endif
+  XGBDefaultDeviceAllocatorImpl()
+      : SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 };
 
 /**
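Note: XGBDefaultDeviceAllocatorImpl is the allocator behind XGBoost's logged device vectors; the two hunks above only restore the upstream NOLINT markers and line wrapping. A hedged usage sketch, assuming the allocator type from this header is visible (XGBoost normally exposes it through an XGBDeviceAllocator alias):

    #include <thrust/device_vector.h>
    #include <thrust/fill.h>

    // Illustrative alias; roughly what dh::device_vector is in XGBoost.
    template <typename T>
    using LoggedVector = thrust::device_vector<T, XGBDefaultDeviceAllocatorImpl<T>>;

    void Example() {
      LoggedVector<float> buf(1 << 20);           // allocate() registers 4 MiB with GlobalMemoryLogger()
      thrust::fill(buf.begin(), buf.end(), 0.f);  // ordinary Thrust usage; the allocator is transparent
    }                                             // deallocate() unregisters the allocation here

When XGBOOST_USE_RMM is enabled, the restored two-line constructor routes these allocations through rmm::mr::get_current_device_resource() on the default RMM stream, as the diff itself shows.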
@@ -447,11 +441,12 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
         std::make_unique<hipcub::CachingDeviceAllocator>(2, 9, 29)};
     return *allocator;
   }
-  pointer allocate(size_t n) {
+  pointer allocate(size_t n) {  // NOLINT
     pointer thrust_ptr;
     if (use_cub_allocator_) {
       T* raw_ptr{nullptr};
-      auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr), n * sizeof(T));
+      auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
+                                                             n * sizeof(T));
       if (errc != hipSuccess) {
         ThrowOOMError("Caching allocator", n * sizeof(T));
       }
@@ -467,7 +462,7 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
     GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
     return thrust_ptr;
   }
-  void deallocate(pointer ptr, size_t n) {
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
     GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
     if (use_cub_allocator_) {
       GetGlobalCachingAllocator().DeviceFree(ptr.get());
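Note: the caching path above wraps hipcub's CachingDeviceAllocator, constructed with bin_growth = 2, min_bin = 9 and max_bin = 29 (so cached block sizes of roughly 2^9 to 2^29 bytes, assuming the cub semantics carry over to hipCUB). A minimal sketch of the same DeviceAllocate/DeviceFree pattern outside the allocator class, with stream arguments and most error handling elided:

    #include <hip/hip_runtime.h>
    #include <hipcub/hipcub.hpp>

    hipcub::CachingDeviceAllocator pool(2, 9, 29);

    void Example(size_t n) {
      float *ptr = nullptr;
      if (pool.DeviceAllocate(reinterpret_cast<void **>(&ptr), n * sizeof(float)) != hipSuccess) {
        // out of device memory; the header calls ThrowOOMError() at this point
        return;
      }
      // ... use ptr in kernels ...
      pool.DeviceFree(ptr);  // returns the block to the pool instead of freeing it
    }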
@@ -606,7 +601,8 @@ void CopyToD(HContainer const &h, DContainer *d) {
   d->resize(h.size());
   using HVT = std::remove_cv_t<typename HContainer::value_type>;
   using DVT = std::remove_cv_t<typename DContainer::value_type>;
-  static_assert(std::is_same<HVT, DVT>::value, "Host and device containers must have same value type.");
+  static_assert(std::is_same<HVT, DVT>::value,
+                "Host and device containers must have same value type.");
   dh::safe_cuda(hipMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT),
                                hipMemcpyHostToDevice));
 }
@@ -658,22 +654,20 @@ struct PinnedMemory {
 template <typename T>
 typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
   using ValueT = typename std::iterator_traits<T>::value_type;
   size_t tmpSize {0};
   ValueT *dummy_out = nullptr;
-
-  try {
-    dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
-
-    TemporaryArray<char> temp(tmpSize + sizeof(ValueT));
-    auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1;
-    dh::safe_cuda(hipcub::DeviceReduce::Sum(
-        reinterpret_cast<void *>(ptr), tmpSize, in, reinterpret_cast<ValueT *>(temp.data().get()), nVals));
-    ValueT sum;
-    dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT), hipMemcpyDeviceToHost));
-    return sum;
-  } catch (const std::exception& e) {
-    throw;
-  }
+  dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
+
+  TemporaryArray<char> temp(tmpSize + sizeof(ValueT));
+  auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1;
+  dh::safe_cuda(hipcub::DeviceReduce::Sum(
+      reinterpret_cast<void *>(ptr), tmpSize, in,
+      reinterpret_cast<ValueT *>(temp.data().get()),
+      nVals));
+  ValueT sum;
+  dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT),
+                          hipMemcpyDeviceToHost));
+  return sum;
 }
 
 constexpr std::pair<int, int> CUDAVersion() {
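Note: SumReduction keeps the standard two-pass hipcub reduction; the hunk only drops the redundant try/rethrow wrapper and restores the upstream line wrapping. The pattern, sketched standalone below with plain hipMalloc in place of TemporaryArray and error checks elided: the first call with a null temporary buffer only reports the scratch size, the second call writes the result into the front of the same allocation.

    #include <hip/hip_runtime.h>
    #include <hipcub/hipcub.hpp>

    float SumOnDevice(const float *d_in, int n) {
      size_t tmp_bytes = 0;
      float *d_out = nullptr;
      hipcub::DeviceReduce::Sum(nullptr, tmp_bytes, d_in, d_out, n);      // pass 1: size query only

      void *d_buf = nullptr;
      hipMalloc(&d_buf, tmp_bytes + sizeof(float));
      d_out = static_cast<float *>(d_buf);                                // result lives in the first bytes
      void *d_scratch = static_cast<char *>(d_buf) + sizeof(float);       // scratch follows, as in TemporaryArray
      hipcub::DeviceReduce::Sum(d_scratch, tmp_bytes, d_in, d_out, n);    // pass 2: the actual reduction

      float result = 0.f;
      hipMemcpy(&result, d_out, sizeof(float), hipMemcpyDeviceToHost);
      hipFree(d_buf);
      return result;
    }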
@@ -961,24 +955,15 @@ template <typename Policy, typename InputIt, typename Init, typename Func>
 auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
   size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
   size_t size = std::distance(first, second);
-
   using Ty = std::remove_cv_t<Init>;
   Ty aggregate = init;
-
   for (size_t offset = 0; offset < size; offset += kLimit) {
     auto begin_it = first + offset;
     auto end_it = first + std::min(offset + kLimit, size);
     size_t batch_size = std::distance(begin_it, end_it);
     CHECK_LE(batch_size, size);
-    try {
-      // Print the iterator types
-      auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
-      aggregate = reduce_op(aggregate, ret);
-    } catch (const thrust::system_error& e) {
-      throw;
-    } catch (const std::exception& e) {
-      throw;
-    }
+    auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
+    aggregate = reduce_op(aggregate, ret);
   }
   return aggregate;
 }
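Note: Reduce folds thrust::reduce over chunks of at most INT32_MAX / 2 elements, presumably to stay clear of 32-bit size limits in the underlying implementation; the removed try/catch blocks only rethrew, so behaviour is unchanged. A hedged usage sketch (the dh:: namespace qualification is assumed from this header; per-chunk results are combined with the same functor, so reduce_op should be associative):

    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/functional.h>

    double Total(const thrust::device_vector<double> &v) {
      return dh::Reduce(thrust::device, v.cbegin(), v.cend(), 0.0, thrust::plus<double>{});
    }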
@@ -1066,7 +1051,7 @@ inline CUDAStreamView DefaultStream() {
 #ifdef HIP_API_PER_THREAD_DEFAULT_STREAM
   return CUDAStreamView{hipStreamPerThread};
 #else
-  return CUDAStreamView{hipStreamDefault};
+  return CUDAStreamView{hipStreamLegacy};
#endif
 }
 
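Note: the last hunk swaps hipStreamDefault, a zero-valued stream-creation flag that effectively selects the null stream when passed as a handle, for hipStreamLegacy, HIP's explicit handle for the legacy default stream, mirroring cudaStreamLegacy on the CUDA side. A small sketch of how the returned view is typically consumed; CUDAStreamView wrapping a hipStream_t is taken from this header, and the helper below is illustrative only:

    #include <hip/hip_runtime.h>

    // Illustrative helper: submit a copy on whatever stream DefaultStream() selected
    // (the per-thread default stream if HIP_API_PER_THREAD_DEFAULT_STREAM is defined,
    // otherwise the shared legacy default stream) and wait for it to finish.
    void CopyOnStream(void *dst, const void *src, size_t bytes, hipStream_t s) {
      hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToDevice, s);
      hipStreamSynchronize(s);
    }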