Avoid thrust vector initialization. (#10544)
* Avoid thrust vector initialization.
- Add a wrapper for the rmm device uvector.
- Split up the `Resize` method for HDV.
Parent: 89da9f9741
Commit: 1ca4bfd20e
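The motivation: `thrust::device_vector::resize` value-initializes every new element with a fill kernel, while `rmm::device_uvector` only allocates and leaves the memory untouched. A minimal standalone sketch of the difference (illustrative only, assuming an RMM build; not part of the diff):

#include <thrust/device_vector.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

void Compare(std::size_t n) {
  thrust::device_vector<float> a;
  a.resize(n);  // launches a fill kernel: every element is set to float{}

  rmm::device_uvector<float> b{0, rmm::cuda_stream_default};
  b.resize(n, rmm::cuda_stream_default);  // allocation only; contents left undefined
}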
@@ -135,7 +135,9 @@ class HostDeviceVector {
   void SetDevice(DeviceOrd device) const;
 
-  void Resize(size_t new_size, T v = T());
+  void Resize(std::size_t new_size);
+  /** @brief Resize and initialize the data if the new size is larger than the old size. */
+  void Resize(std::size_t new_size, T v);
 
   using value_type = T;  // NOLINT
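Call sites now choose between the two overloads explicitly; a usage sketch (values are hypothetical):

xgboost::HostDeviceVector<float> vec;
vec.SetDevice(xgboost::DeviceOrd::CUDA(0));
vec.Resize(128);        // no fill value: new device elements may stay uninitialized
vec.Resize(256, 0.0f);  // with fill value: elements [128, 256) are set to 0.0f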
@@ -18,7 +18,7 @@ struct CUDAContext {
    * \brief Caching thrust policy.
    */
  auto CTP() const {
-#if THRUST_MAJOR_VERSION >= 2
+#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
    return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
    return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
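`par_nosync` asks Thrust to skip the stream synchronization it would otherwise issue after each algorithm, and the change enables it whenever RMM's stream-ordered allocator is in use, not only on Thrust 2+. A consumption sketch (assumes a `xgboost::Context *ctx`; the `CUDACtx()` accessor follows the rest of the code base):

auto const *cuctx = ctx->CUDACtx();
thrust::sort(cuctx->CTP(), keys.begin(), keys.end());
// The sort is enqueued on dh::DefaultStream() without a trailing device sync.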
@@ -1,26 +1,21 @@
 /**
- * Copyright 2017-2023 XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #pragma once
 #include <thrust/binary_search.h>  // thrust::upper_bound
-#include <thrust/device_malloc_allocator.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>     // for device_ptr
+#include <thrust/device_vector.h>  // for device_vector
 #include <thrust/execution_policy.h>  // thrust::seq
-#include <thrust/gather.h>  // gather
-#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/discard_iterator.h>  // for discard_iterator
 #include <thrust/iterator/transform_output_iterator.h>  // make_transform_output_iterator
-#include <thrust/logical.h>
-#include <thrust/sequence.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
-#include <thrust/transform_scan.h>
 #include <thrust/unique.h>
 
 #include <algorithm>
 #include <cstddef>  // for size_t
 #include <cub/cub.cuh>
-#include <cub/util_allocator.cuh>
+#include <cub/util_type.cuh>  // for UnitWord
 #include <sstream>
 #include <string>
 #include <tuple>
@@ -28,22 +23,14 @@
 
 #include "../collective/communicator-inl.h"
 #include "common.h"
+#include "device_vector.cuh"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/logging.h"
 #include "xgboost/span.h"
 
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-#include "rmm/mr/device/per_device_resource.hpp"
-#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
-#include "rmm/version_config.hpp"
-
-#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-#error "Please use RMM version 0.18 or later"
-#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
-#error "Please use RMM version 0.18 or later"
-#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#if defined(XGBOOST_USE_RMM)
+#include <rmm/exec_policy.hpp>
+#endif  // defined(XGBOOST_USE_RMM)
 
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
   LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
 }
 
-namespace detail {
-/** \brief Keeps track of global device memory allocations. Thread safe.*/
-class MemoryLogger {
-  // Information for a single device
-  struct DeviceStats {
-    size_t currently_allocated_bytes{ 0 };
-    size_t peak_allocated_bytes{ 0 };
-    size_t num_allocations{ 0 };
-    size_t num_deallocations{ 0 };
-    std::map<void *, size_t> device_allocations;
-    void RegisterAllocation(void *ptr, size_t n) {
-      device_allocations[ptr] = n;
-      currently_allocated_bytes += n;
-      peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
-      num_allocations++;
-      CHECK_GT(num_allocations, num_deallocations);
-    }
-    void RegisterDeallocation(void *ptr, size_t n, int current_device) {
-      auto itr = device_allocations.find(ptr);
-      if (itr == device_allocations.end()) {
-        LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
-                     << " that was never allocated\n"
-                     << dmlc::StackTrace();
-      } else {
-        num_deallocations++;
-        CHECK_LE(num_deallocations, num_allocations);
-        currently_allocated_bytes -= itr->second;
-        device_allocations.erase(itr);
-      }
-    }
-  };
-  DeviceStats stats_;
-  std::mutex mutex_;
-
- public:
-  void RegisterAllocation(void *ptr, size_t n) {
-    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      return;
-    }
-    std::lock_guard<std::mutex> guard(mutex_);
-    int current_device;
-    safe_cuda(cudaGetDevice(&current_device));
-    stats_.RegisterAllocation(ptr, n);
-  }
-  void RegisterDeallocation(void *ptr, size_t n) {
-    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      return;
-    }
-    std::lock_guard<std::mutex> guard(mutex_);
-    int current_device;
-    safe_cuda(cudaGetDevice(&current_device));
-    stats_.RegisterDeallocation(ptr, n, current_device);
-  }
-  size_t PeakMemory() const {
-    return stats_.peak_allocated_bytes;
-  }
-  size_t CurrentlyAllocatedBytes() const {
-    return stats_.currently_allocated_bytes;
-  }
-  void Clear()
-  {
-    stats_ = DeviceStats();
-  }
-
-  void Log() {
-    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      return;
-    }
-    std::lock_guard<std::mutex> guard(mutex_);
-    int current_device;
-    safe_cuda(cudaGetDevice(&current_device));
-    LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
-                 << " ========";
-    LOG(CONSOLE) << "Peak memory usage: "
-                 << stats_.peak_allocated_bytes / 1048576 << "MiB";
-    LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
-  }
-};
-} // namespace detail
-
-inline detail::MemoryLogger &GlobalMemoryLogger() {
-  static detail::MemoryLogger memory_logger;
-  return memory_logger;
-}
-
 // dh::DebugSyncDevice(__FILE__, __LINE__);
 inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
   if (file != "" && line != -1) {
@@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
   safe_cuda(cudaGetLastError());
 }
 
-namespace detail {
-
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-template <typename T>
-using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
-#else  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-template <typename T>
-using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-
-inline void ThrowOOMError(std::string const& err, size_t bytes) {
-  auto device = CurrentDevice();
-  auto rank = xgboost::collective::GetRank();
-  std::stringstream ss;
-  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
-     << "- Free memory: " << AvailableMemory(device) << "\n"
-     << "- Requested memory: " << bytes << std::endl;
-  LOG(FATAL) << ss.str();
-}
-
-/**
- * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
- */
-template <class T>
-struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
-  using SuperT = XGBBaseDeviceAllocator<T>;
-  using pointer = thrust::device_ptr<T>;  // NOLINT
-  template<typename U>
-  struct rebind  // NOLINT
-  {
-    using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
-  };
-  pointer allocate(size_t n) {  // NOLINT
-    pointer ptr;
-    try {
-      ptr = SuperT::allocate(n);
-      dh::safe_cuda(cudaGetLastError());
-    } catch (const std::exception &e) {
-      ThrowOOMError(e.what(), n * sizeof(T));
-    }
-    GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
-    return ptr;
-  }
-  void deallocate(pointer ptr, size_t n) {  // NOLINT
-    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    SuperT::deallocate(ptr, n);
-  }
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBDefaultDeviceAllocatorImpl()
-      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-};
-
-/**
- * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
- *        RMM pool allocator is enabled. Does not initialise memory on construction.
- */
-template <class T>
-struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
-  using SuperT = XGBBaseDeviceAllocator<T>;
-  using pointer = thrust::device_ptr<T>;  // NOLINT
-  template<typename U>
-  struct rebind  // NOLINT
-  {
-    using other = XGBCachingDeviceAllocatorImpl<U>;  // NOLINT
-  };
-  cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
-    // Configure allocator with maximum cached bin size of ~1GB and no limit on
-    // maximum cached bytes
-    thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
-        std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
-    return *allocator;
-  }
-  pointer allocate(size_t n) {  // NOLINT
-    pointer thrust_ptr;
-    if (use_cub_allocator_) {
-      T* raw_ptr{nullptr};
-      auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
-                                                             n * sizeof(T));
-      if (errc != cudaSuccess) {
-        ThrowOOMError("Caching allocator", n * sizeof(T));
-      }
-      thrust_ptr = pointer(raw_ptr);
-    } else {
-      try {
-        thrust_ptr = SuperT::allocate(n);
-        dh::safe_cuda(cudaGetLastError());
-      } catch (const std::exception &e) {
-        ThrowOOMError(e.what(), n * sizeof(T));
-      }
-    }
-    GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
-    return thrust_ptr;
-  }
-  void deallocate(pointer ptr, size_t n) {  // NOLINT
-    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    if (use_cub_allocator_) {
-      GetGlobalCachingAllocator().DeviceFree(ptr.get());
-    } else {
-      SuperT::deallocate(ptr, n);
-    }
-  }
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBCachingDeviceAllocatorImpl()
-      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
-        use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBOOST_DEVICE void construct(T *) {}  // NOLINT
-
- private:
-  bool use_cub_allocator_{true};
-};
-}  // namespace detail
-
-// Declare xgboost allocators
-// Replacement of allocator with custom backend should occur here
-template <typename T>
-using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
-/*! Be careful that the initialization constructor is a no-op, which means calling
- * `vec.resize(n)` won't initialize the memory region to 0. Instead use
- * `vec.resize(n, 0)`*/
-template <typename T>
-using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
-/** \brief Specialisation of thrust device vector using custom allocator. */
-template <typename T>
-using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;  // NOLINT
-template <typename T>
-using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;  // NOLINT
-
 // Faster to instantiate than caching_device_vector and invokes no synchronisation
 // Use this where vector functionality (e.g. resize) is not required
 template <typename T>
@@ -734,6 +508,11 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
   return ToSpan(vec, offset, size);
 }
 
+template <typename T>
+xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
+  return {thrust::raw_pointer_cast(vec.data()), vec.size()};
+}
+
 // thrust begin, similiar to std::begin
 template <typename T>
 thrust::device_ptr<T> tbegin(xgboost::HostDeviceVector<T>& vector) {  // NOLINT
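The new overload lets device code consume a `DeviceUVector` through the usual span interface; a sketch:

dh::DeviceUVector<float> data;
data.Resize(100);
auto s = dh::ToSpan(data);  // xgboost::common::Span<float>
dh::LaunchN(s.size(), [=] __device__(std::size_t i) { s[i] = 1.0f; });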
@@ -1117,6 +896,15 @@ class CUDAStream {
   void Sync() { this->View().Sync(); }
 };
 
+inline auto CachingThrustPolicy() {
+  XGBCachingDeviceAllocator<char> alloc;
+#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
+  return thrust::cuda::par_nosync(alloc).on(DefaultStream());
+#else
+  return thrust::cuda::par(alloc).on(DefaultStream());
+#endif  // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
+}
+
 // Force nvcc to load data as constant
 template <typename T>
 class LDGIterator {
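`CachingThrustPolicy()` is the free-standing counterpart of `CUDAContext::CTP()` for call sites that have no `Context` at hand; a sketch:

dh::caching_device_vector<int> idx(100);
thrust::sequence(dh::CachingThrustPolicy(), idx.begin(), idx.end());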
new file: src/common/device_vector.cu
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2017-2024, XGBoost contributors
+ */
+#include "../collective/communicator-inl.h"  // for GetRank
+#include "device_helpers.cuh"                // for CurrentDevice
+#include "device_vector.cuh"
+
+namespace dh {
+namespace detail {
+void ThrowOOMError(std::string const &err, size_t bytes) {
+  auto device = CurrentDevice();
+  auto rank = xgboost::collective::GetRank();
+  std::stringstream ss;
+  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
+     << "- Free memory: " << dh::AvailableMemory(device) << "\n"
+     << "- Requested memory: " << bytes << std::endl;
+  LOG(FATAL) << ss.str();
+}
+}  // namespace detail
+
+#if defined(XGBOOST_USE_RMM)
+LoggingResource *GlobalLoggingResource() {
+  static auto mr{std::make_unique<LoggingResource>()};
+  return mr.get();
+}
+#endif  // defined(XGBOOST_USE_RMM)
+}  // namespace dh
new file: src/common/device_vector.cuh
@@ -0,0 +1,330 @@
+/**
+ * Copyright 2017-2024, XGBoost Contributors
+ */
+#pragma once
+#include <thrust/device_malloc_allocator.h>  // for device_malloc_allocator
+#include <thrust/device_ptr.h>               // for device_ptr
+#include <thrust/device_vector.h>            // for device_vector
+
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#include <rmm/device_uvector.hpp>                      // for device_uvector
+#include <rmm/exec_policy.hpp>                         // for exec_policy_nosync
+#include <rmm/mr/device/device_memory_resource.hpp>    // for device_memory_resource
+#include <rmm/mr/device/per_device_resource.hpp>       // for get_current_device_resource
+#include <rmm/mr/device/thrust_allocator_adaptor.hpp>  // for thrust_allocator
+#include <rmm/version_config.hpp>                      // for RMM_VERSION_MAJOR
+
+#include "xgboost/global_config.h"  // for GlobalConfigThreadLocalStore
+
+#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
+#error "Please use RMM version 0.18 or later"
+#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
+#error "Please use RMM version 0.18 or later"
+#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
+
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
+#include <cstddef>                 // for size_t
+#include <cub/util_allocator.cuh>  // for CachingDeviceAllocator
+#include <cub/util_device.cuh>     // for CurrentDevice
+#include <map>                     // for map
+#include <memory>                  // for unique_ptr
+
+#include "common.h"  // for safe_cuda
+#include "xgboost/logging.h"
+
+namespace dh {
+namespace detail {
+/** \brief Keeps track of global device memory allocations. Thread safe.*/
+class MemoryLogger {
+  // Information for a single device
+  struct DeviceStats {
+    std::size_t currently_allocated_bytes{0};
+    size_t peak_allocated_bytes{0};
+    size_t num_allocations{0};
+    size_t num_deallocations{0};
+    std::map<void *, size_t> device_allocations;
+    void RegisterAllocation(void *ptr, size_t n) {
+      device_allocations[ptr] = n;
+      currently_allocated_bytes += n;
+      peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
+      num_allocations++;
+      CHECK_GT(num_allocations, num_deallocations);
+    }
+    void RegisterDeallocation(void *ptr, size_t n, int current_device) {
+      auto itr = device_allocations.find(ptr);
+      if (itr == device_allocations.end()) {
+        LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
+                     << " that was never allocated\n"
+                     << dmlc::StackTrace();
+      } else {
+        num_deallocations++;
+        CHECK_LE(num_deallocations, num_allocations);
+        currently_allocated_bytes -= itr->second;
+        device_allocations.erase(itr);
+      }
+    }
+  };
+  DeviceStats stats_;
+  std::mutex mutex_;
+
+ public:
+  void RegisterAllocation(void *ptr, size_t n) {
+    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      return;
+    }
+    std::lock_guard<std::mutex> guard(mutex_);
+    stats_.RegisterAllocation(ptr, n);
+  }
+  void RegisterDeallocation(void *ptr, size_t n) {
+    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      return;
+    }
+    std::lock_guard<std::mutex> guard(mutex_);
+    stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice());
+  }
+  size_t PeakMemory() const { return stats_.peak_allocated_bytes; }
+  size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; }
+  void Clear() { stats_ = DeviceStats(); }
+
+  void Log() {
+    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      return;
+    }
+    std::lock_guard<std::mutex> guard(mutex_);
+    int current_device;
+    dh::safe_cuda(cudaGetDevice(&current_device));
+    LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
+                 << " ========";
+    LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
+    LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
+  }
+};
+
+void ThrowOOMError(std::string const &err, size_t bytes);
+}  // namespace detail
+
+inline detail::MemoryLogger &GlobalMemoryLogger() {
+  static detail::MemoryLogger memory_logger;
+  return memory_logger;
+}
+
+namespace detail {
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+template <typename T>
+using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
+#else  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+template <typename T>
+using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
+/**
+ * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
+ */
+template <class T>
+struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
+  using SuperT = XGBBaseDeviceAllocator<T>;
+  using pointer = thrust::device_ptr<T>;  // NOLINT
+  template <typename U>
+  struct rebind  // NOLINT
+  {
+    using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
+  };
+  pointer allocate(size_t n) {  // NOLINT
+    pointer ptr;
+    try {
+      ptr = SuperT::allocate(n);
+      dh::safe_cuda(cudaGetLastError());
+    } catch (const std::exception &e) {
+      detail::ThrowOOMError(e.what(), n * sizeof(T));
+    }
+    GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
+    return ptr;
+  }
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
+    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
+    SuperT::deallocate(ptr, n);
+  }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBDefaultDeviceAllocatorImpl()
+      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+};
+
+/**
+ * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
+ *        RMM pool allocator is enabled. Does not initialise memory on construction.
+ */
+template <class T>
+struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
+  using SuperT = XGBBaseDeviceAllocator<T>;
+  using pointer = thrust::device_ptr<T>;  // NOLINT
+  template <typename U>
+  struct rebind  // NOLINT
+  {
+    using other = XGBCachingDeviceAllocatorImpl<U>;  // NOLINT
+  };
+  cub::CachingDeviceAllocator &GetGlobalCachingAllocator() {
+    // Configure allocator with maximum cached bin size of ~1GB and no limit on
+    // maximum cached bytes
+    thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
+        std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
+    return *allocator;
+  }
+  pointer allocate(size_t n) {  // NOLINT
+    pointer thrust_ptr;
+    if (use_cub_allocator_) {
+      T *raw_ptr{nullptr};
+      auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
+                                                             n * sizeof(T));
+      if (errc != cudaSuccess) {
+        detail::ThrowOOMError("Caching allocator", n * sizeof(T));
+      }
+      thrust_ptr = pointer(raw_ptr);
+    } else {
+      try {
+        thrust_ptr = SuperT::allocate(n);
+        dh::safe_cuda(cudaGetLastError());
+      } catch (const std::exception &e) {
+        detail::ThrowOOMError(e.what(), n * sizeof(T));
+      }
+    }
+    GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
+    return thrust_ptr;
+  }
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
+    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
+    if (use_cub_allocator_) {
+      GetGlobalCachingAllocator().DeviceFree(ptr.get());
+    } else {
+      SuperT::deallocate(ptr, n);
+    }
+  }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBCachingDeviceAllocatorImpl()
+      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
+        use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBOOST_DEVICE void construct(T *) {}  // NOLINT
+
+ private:
+  bool use_cub_allocator_{true};
+};
+}  // namespace detail
+
+// Declare xgboost allocators
+// Replacement of allocator with custom backend should occur here
+template <typename T>
+using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
+
+/** Be careful that the initialization constructor is a no-op, which means calling
+ * `vec.resize(n)` won't initialize the memory region to 0. Instead use
+ * `vec.resize(n, 0)`
+ */
+template <typename T>
+using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
+
+/** @brief Specialisation of thrust device vector using custom allocator. */
+template <typename T>
+using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;  // NOLINT
+template <typename T>
+using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;  // NOLINT
+
+#if defined(XGBOOST_USE_RMM)
+/**
+ * @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead.
+ */
+class LoggingResource : public rmm::mr::device_memory_resource {
+  rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()};
+
+ public:
+  LoggingResource() = default;
+  ~LoggingResource() override = default;
+  LoggingResource(LoggingResource const &) = delete;
+  LoggingResource &operator=(LoggingResource const &) = delete;
+  LoggingResource(LoggingResource &&) noexcept = default;
+  LoggingResource &operator=(LoggingResource &&) noexcept = default;
+
+  [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept {  // NOLINT
+    return mr_;
+  }
+  [[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept {  // NOLINT
+    return mr_;
+  }
+
+  void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override {  // NOLINT
+    try {
+      auto const ptr = mr_->allocate(bytes, stream);
+      GlobalMemoryLogger().RegisterAllocation(ptr, bytes);
+      return ptr;
+    } catch (rmm::bad_alloc const &e) {
+      detail::ThrowOOMError(e.what(), bytes);
+    }
+    return nullptr;
+  }
+
+  void do_deallocate(void *ptr, std::size_t bytes,  // NOLINT
+                     rmm::cuda_stream_view stream) override {
+    mr_->deallocate(ptr, bytes, stream);
+    GlobalMemoryLogger().RegisterDeallocation(ptr, bytes);
+  }
+
+  [[nodiscard]] bool do_is_equal(  // NOLINT
+      device_memory_resource const &other) const noexcept override {
+    if (this == &other) {
+      return true;
+    }
+    auto const *cast = dynamic_cast<LoggingResource const *>(&other);
+    if (cast == nullptr) {
+      return mr_->is_equal(other);
+    }
+    return get_upstream_resource() == cast->get_upstream_resource();
+  }
+};
+
+LoggingResource *GlobalLoggingResource();
+
+/**
+ * @brief Container class that doesn't initialize the data.
+ */
+template <typename T>
+class DeviceUVector : public rmm::device_uvector<T> {
+  using Super = rmm::device_uvector<T>;
+
+ public:
+  DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
+
+  void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
+  void Resize(std::size_t n, T const &v) {
+    auto orig = this->size();
+    Super::resize(n, rmm::cuda_stream_per_thread);
+    if (orig < n) {
+      thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
+    }
+  }
+
+ private:
+  // undefined private, cannot be accessed.
+  void resize(std::size_t n, rmm::cuda_stream_view stream);  // NOLINT
+};
+
+#else
+
+/**
+ * @brief Without RMM, the initialization will happen.
+ */
+template <typename T>
+class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
+  using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
+
+ public:
+  void Resize(std::size_t n) { Super::resize(n); }
+  void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
+
+ private:
+  // undefined private, cannot be accessed.
+  void resize(std::size_t n, T const &v = T{});  // NOLINT
+};
+
+#endif  // defined(XGBOOST_USE_RMM)
+}  // namespace dh
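Both branches expose the same two-method surface, so `HostDeviceVector` can forward to either. Under RMM, `Resize(n)` is allocation-only and `Resize(n, v)` fills just the newly appended tail; the inherited `resize` is re-declared private so the initializing base-class path cannot be reached by accident. A behavior sketch (RMM build assumed):

dh::DeviceUVector<int> v;
v.Resize(4);      // allocation only: v[0..3] hold indeterminate values
v.Resize(8, -1);  // v[4..7] are filled with -1 via thrust::fill; v[0..3] untouched
// v.resize(8, rmm::cuda_stream_per_thread);  // ill-formed: resize is private here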
@@ -114,6 +114,11 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->Vec().resize(new_size, v);
 }
 
+template <typename T>
+void HostDeviceVector<T>::Resize(size_t new_size) {
+  impl_->Vec().resize(new_size, T{});
+}
+
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
   std::fill(HostVector().begin(), HostVector().end(), v);
@@ -1,16 +1,17 @@
 /**
- * Copyright 2017-2023 by XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <thrust/fill.h>
-#include <thrust/device_ptr.h>
 
 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <cstdint>
 
+#include "device_helpers.cuh"
+#include "device_vector.cuh"  // for DeviceUVector
 #include "xgboost/data.h"
 #include "xgboost/host_device_vector.h"
-#include "xgboost/tree_model.h"
-#include "device_helpers.cuh"
+#include "xgboost/tree_model.h"  // for RegTree
 
 namespace xgboost {
@@ -28,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(size, v);
+      data_d_->Resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -66,22 +67,22 @@ class HostDeviceVectorImpl {
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return data_d_->data().get();
+    return thrust::raw_pointer_cast(data_d_->data());
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return data_d_->data().get();
+    return thrust::raw_pointer_cast(data_d_->data());
   }
 
   common::Span<T> DeviceSpan() {
     LazySyncDevice(GPUAccess::kWrite);
-    return {data_d_->data().get(), Size()};
+    return {this->DevicePointer(), Size()};
   }
 
   common::Span<const T> ConstDeviceSpan() {
     LazySyncDevice(GPUAccess::kRead);
-    return {data_d_->data().get(), Size()};
+    return {this->ConstDevicePointer(), Size()};
   }
 
   void Fill(T v) {  // NOLINT
@@ -91,7 +92,7 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto s_data = dh::ToSpan(*data_d_);
-      dh::LaunchN(data_d_->size(),
+      dh::LaunchN(data_d_->size(), dh::DefaultStream(),
                   [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
     }
   }
@@ -128,7 +129,7 @@ class HostDeviceVectorImpl {
 
   void Extend(HostDeviceVectorImpl* other) {
     auto ori_size = this->Size();
-    this->Resize(ori_size + other->Size(), T());
+    this->Resize(ori_size + other->Size(), T{});
     if (HostCanWrite() && other->HostCanRead()) {
       auto& h_vec = this->HostVector();
       auto& other_vec = other->HostVector();
@@ -138,10 +139,9 @@ class HostDeviceVectorImpl {
       auto ptr = other->ConstDevicePointer();
       SetDevice();
       CHECK_EQ(this->Device(), other->Device());
-      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
-                                    ptr,
-                                    other->Size() * sizeof(T),
-                                    cudaMemcpyDeviceToDevice));
+      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
+                                    other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
+                                    dh::DefaultStream()));
     }
   }
@@ -171,17 +171,22 @@ class HostDeviceVectorImpl {
     }
   }
 
-  void Resize(size_t new_size, T v) {
-    if (new_size == Size()) { return; }
+  template <typename... U>
+  auto Resize(std::size_t new_size, U&&... args) {
+    if (new_size == Size()) {
+      return;
+    }
     if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
       // fast on-device resize
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(new_size, v);
+      auto old_size = data_d_->size();
+      data_d_->Resize(new_size, std::forward<U>(args)...);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
-      data_h_.resize(new_size, v);
+      auto old_size = data_h_.size();
+      data_h_.resize(new_size, std::forward<U>(args)...);
     }
   }
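The parameter pack folds both public overloads into a single implementation: with an empty pack the call collapses to `Resize(new_size)`, and with a fill value to `Resize(new_size, v)`. A standalone sketch of the same forwarding trick (illustrative, using `std::vector` instead of the device containers):

#include <cstddef>
#include <utility>
#include <vector>

template <typename T>
class Holder {
  std::vector<T> data_;

 public:
  template <typename... U>
  void Resize(std::size_t n, U &&...args) {
    // Expands to data_.resize(n) or data_.resize(n, v) depending on the caller.
    data_.resize(n, std::forward<U>(args)...);
  }
};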
@@ -195,10 +200,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(),
-                             data_d_->data().get(),
-                             data_d_->size() * sizeof(T),
-                             cudaMemcpyDeviceToHost));
+    dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
+                             data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -211,10 +214,9 @@ class HostDeviceVectorImpl {
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
-                                  data_h_.data(),
-                                  data_d_->size() * sizeof(T),
-                                  cudaMemcpyHostToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
+                                  data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
+                                  dh::DefaultStream()));
     gpu_access_ = access;
   }
@@ -229,7 +231,7 @@ class HostDeviceVectorImpl {
  private:
   DeviceOrd device_{DeviceOrd::CPU()};
   std::vector<T> data_h_{};
-  std::unique_ptr<dh::device_vector<T>> data_d_{};
+  std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
 
   void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -239,8 +241,10 @@ class HostDeviceVectorImpl {
       LazyResizeDevice(Size());
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                    data_d_->size() * sizeof(T), cudaMemcpyDefault));
+      dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
+                                    thrust::raw_pointer_cast(other->data_d_->data()),
+                                    data_d_->size() * sizeof(T), cudaMemcpyDefault,
+                                    dh::DefaultStream()));
     }
   }
@@ -248,14 +252,15 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
-                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
+                                  data_d_->size() * sizeof(T), cudaMemcpyDefault,
+                                  dh::DefaultStream()));
   }
 
   void LazyResizeDevice(size_t new_size) {
     if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_->resize(new_size);
+    data_d_->Resize(new_size);
   }
 
   void SetDevice() {
@@ -267,7 +272,7 @@ class HostDeviceVectorImpl {
     }
 
     if (!data_d_) {
-      data_d_.reset(new dh::device_vector<T>);
+      data_d_.reset(new dh::DeviceUVector<T>{});
     }
   }
 };
@@ -397,7 +402,12 @@ void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(std::size_t new_size) {
+  impl_->Resize(new_size);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
   impl_->Resize(new_size, v);
 }
@@ -427,5 +437,4 @@ template class HostDeviceVector<RTreeNodeStat>;
  */
 template class HostDeviceVector<std::size_t>;
 #endif  // defined(__APPLE__)
-
 }  // namespace xgboost
@@ -4,12 +4,14 @@
 #ifndef XGBOOST_COMMON_QUANTILE_CUH_
 #define XGBOOST_COMMON_QUANTILE_CUH_
 
-#include "xgboost/span.h"
-#include "xgboost/data.h"
+#include <thrust/logical.h>  // for any_of
+
+#include "categorical.h"
 #include "device_helpers.cuh"
 #include "quantile.h"
 #include "timer.h"
-#include "categorical.h"
+#include "xgboost/data.h"
+#include "xgboost/span.h"
 
 namespace xgboost {
 namespace common {
@@ -100,9 +102,9 @@ class SketchContainer {
     CHECK(device.IsCUDA());
     // Initialize Sketches for this dmatrix
     this->columns_ptr_.SetDevice(device_);
-    this->columns_ptr_.Resize(num_columns + 1);
+    this->columns_ptr_.Resize(num_columns + 1, 0);
     this->columns_ptr_b_.SetDevice(device_);
-    this->columns_ptr_b_.Resize(num_columns + 1);
+    this->columns_ptr_b_.Resize(num_columns + 1, 0);
 
     this->feature_types_.Resize(feature_types.Size());
     this->feature_types_.Copy(feature_types);
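Because the one-argument `Resize` no longer zero-fills device memory, offset arrays such as `columns_ptr_` must request initialization explicitly; a sketch of the hazard being avoided (element type simplified to `std::size_t`, `num_columns` hypothetical):

xgboost::HostDeviceVector<std::size_t> columns_ptr;
columns_ptr.SetDevice(xgboost::DeviceOrd::CUDA(0));
// Without the explicit 0 the entries would be indeterminate under RMM, and the
// first scan over the offsets would read garbage.
columns_ptr.Resize(num_columns + 1, 0);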
@@ -1,7 +1,8 @@
 /**
  * Copyright 2021-2024, XGBoost Contributors
  */
 #include <thrust/copy.h>  // for copy
+#include <thrust/logical.h>  // for any_of
 #include <thrust/scan.h>
 
 #include <cassert>
@@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater {
     out["hist_train_param"] = ToJson(hist_maker_param_);
   }
 
-  ~GPUHistMaker() {  // NOLINT
-    dh::GlobalMemoryLogger().Log();
-  }
+  ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); }
 
   void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
new file: tests/cpp/common/test_device_vector.cu
@@ -0,0 +1,21 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../../../src/common/device_vector.cuh"
+#include "xgboost/global_config.h"  // for GlobalConfigThreadLocalStore
+
+namespace dh {
+TEST(DeviceUVector, Basic) {
+  GlobalMemoryLogger().Clear();
+  std::int32_t verbosity{3};
+  std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
+  DeviceUVector<float> uvec;
+  uvec.Resize(12);
+  auto peak = GlobalMemoryLogger().PeakMemory();
+  auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
+  ASSERT_EQ(peak, n_bytes);
+  std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
+}
+}  // namespace dh
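The assertion holds because `Resize(12)` performs exactly one device allocation and no fill kernel, so the recorded peak equals the raw buffer size: 12 elements * sizeof(float) = 48 bytes. The `MemoryLogger` records only while verbosity is at debug level, which is what the `std::swap` around the body arranges; a sketch of that dependency:

// If verbosity were left below debug, the logger would record nothing and
// PeakMemory() would stay 0 — hence the swap to verbosity 3 before the body.
dh::GlobalMemoryLogger().Clear();
dh::DeviceUVector<float> v;
v.Resize(12);                                       // one allocation of 48 bytes
auto peak = dh::GlobalMemoryLogger().PeakMemory();  // 48 with debug logging, 0 without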
@@ -1,5 +1,5 @@
 /**
- * Copyright 2018-2023 XGBoost contributors
+ * Copyright 2018-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
@@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) {
   ASSERT_FALSE(another.Empty());
   ASSERT_TRUE(vec.Empty());
 }
+
+TEST(HostDeviceVector, Resize) {
+  auto check = [&](HostDeviceVector<float> const& vec) {
+    auto const& h_vec = vec.ConstHostSpan();
+    for (std::size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(h_vec[i], i + 1);
+    }
+    for (std::size_t i = 4; i < vec.Size(); ++i) {
+      ASSERT_EQ(h_vec[i], 3.0);
+    }
+  };
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    vec.SetDevice(DeviceOrd::CUDA(0));
+    vec.ConstDeviceSpan();
+    ASSERT_TRUE(vec.DeviceCanRead());
+    ASSERT_FALSE(vec.DeviceCanWrite());
+    vec.DeviceSpan();
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    ASSERT_TRUE(vec.HostCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.HostCanWrite());
+    check(vec);
+  }
+}
 }  // namespace xgboost::common
@@ -1,15 +1,14 @@
-// Copyright (c) 2019 by Contributors
+/**
+ * Copyright 2019-2024, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>  // for device
+#include <thrust/sequence.h>          // for sequence
 #include <xgboost/data.h>
 #include <xgboost/json.h>
-#include <thrust/device_vector.h>
-
-#include <memory>
-#include "../../../src/common/bitfield.h"
-#include "../../../src/common/device_helpers.cuh"
 
 namespace xgboost {
 
 template <typename T>
 Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
                          thrust::device_vector<T>* out_d_data) {
|||||||
Loading…
x
Reference in New Issue
Block a user