Avoid thrust vector initialization. (#10544)

* Avoid thrust vector initialization.

- Add a wrapper (`DeviceUVector`) for `rmm::device_uvector`.
- Split the `Resize` method for `HostDeviceVector` (HDV) into initializing and non-initializing overloads.
Jiaming Yuan 2024-07-11 17:29:27 +08:00 committed by GitHub
parent 89da9f9741
commit 1ca4bfd20e
13 changed files with 510 additions and 291 deletions


@@ -135,7 +135,9 @@ class HostDeviceVector {
void SetDevice(DeviceOrd device) const;
void Resize(size_t new_size, T v = T());
void Resize(std::size_t new_size);
/** @brief Resize and initialize the data if the new size is larger than the old size. */
void Resize(std::size_t new_size, T v);
using value_type = T; // NOLINT
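
For orientation, a hedged sketch of how the two overloads behave after this change; the calls mirror the declarations above, while the function itself is invented for illustration:

```cpp
#include "xgboost/host_device_vector.h"

// Sketch: Resize(n) grows storage without a fill value (under the RMM build
// the new device tail is left uninitialized), while Resize(n, v) fills only
// the newly added elements. The host and non-RMM paths still value-initialize.
void ResizeSketch(xgboost::HostDeviceVector<float>* vec) {
  vec->SetDevice(xgboost::DeviceOrd::CUDA(0));
  vec->Resize(100);        // storage only
  vec->Resize(200, 1.0f);  // elements [100, 200) become 1.0f
}
```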


@@ -18,7 +18,7 @@ struct CUDAContext {
* \brief Caching thrust policy.
*/
auto CTP() const {
#if THRUST_MAJOR_VERSION >= 2
#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
#else
return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
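
For background, `thrust::cuda::par_nosync` (available since Thrust 1.16) returns a policy whose algorithms skip the implicit end-of-call synchronization. A self-contained sketch of the same pattern; the function, functor, and stream handling are illustrative, not part of this diff:

```cpp
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <thrust/version.h>  // for THRUST_MAJOR_VERSION

struct Doubler {
  __device__ float operator()(float x) const { return 2.0f * x; }
};

// Enqueue a transform on `stream` without blocking the host; the caller must
// synchronize the stream before reading the results.
void ScaleAsync(thrust::device_vector<float>* data, cudaStream_t stream) {
#if THRUST_MAJOR_VERSION >= 2
  auto policy = thrust::cuda::par_nosync.on(stream);
#else
  auto policy = thrust::cuda::par.on(stream);  // older Thrust syncs per algorithm
#endif
  thrust::transform(policy, data->begin(), data->end(), data->begin(), Doubler{});
}
```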


@@ -1,26 +1,21 @@
/**
* Copyright 2017-2023 XGBoost contributors
* Copyright 2017-2024, XGBoost contributors
*/
#pragma once
#include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_ptr.h> // for device_ptr
#include <thrust/device_vector.h> // for device_vector
#include <thrust/execution_policy.h> // thrust::seq
#include <thrust/gather.h> // gather
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/discard_iterator.h> // for discard_iterator
#include <thrust/iterator/transform_output_iterator.h> // make_transform_output_iterator
#include <thrust/logical.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <thrust/transform_scan.h>
#include <thrust/unique.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_type.cuh> // for UnitWord
#include <sstream>
#include <string>
#include <tuple>
@@ -28,22 +23,14 @@
#include "../collective/communicator-inl.h"
#include "common.h"
#include "device_vector.cuh"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/span.h"
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
#include "rmm/version_config.hpp"
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#error "Please use RMM version 0.18 or later"
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
#error "Please use RMM version 0.18 or later"
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#if defined(XGBOOST_USE_RMM)
#include <rmm/exec_policy.hpp>
#endif // defined(XGBOOST_USE_RMM)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
}
namespace detail {
/** \brief Keeps track of global device memory allocations. Thread safe.*/
class MemoryLogger {
// Information for a single device
struct DeviceStats {
size_t currently_allocated_bytes{ 0 };
size_t peak_allocated_bytes{ 0 };
size_t num_allocations{ 0 };
size_t num_deallocations{ 0 };
std::map<void *, size_t> device_allocations;
void RegisterAllocation(void *ptr, size_t n) {
device_allocations[ptr] = n;
currently_allocated_bytes += n;
peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
num_allocations++;
CHECK_GT(num_allocations, num_deallocations);
}
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
auto itr = device_allocations.find(ptr);
if (itr == device_allocations.end()) {
LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
<< " that was never allocated\n"
<< dmlc::StackTrace();
} else {
num_deallocations++;
CHECK_LE(num_deallocations, num_allocations);
currently_allocated_bytes -= itr->second;
device_allocations.erase(itr);
}
}
};
DeviceStats stats_;
std::mutex mutex_;
public:
void RegisterAllocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_.RegisterAllocation(ptr, n);
}
void RegisterDeallocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_.RegisterDeallocation(ptr, n, current_device);
}
size_t PeakMemory() const {
return stats_.peak_allocated_bytes;
}
size_t CurrentlyAllocatedBytes() const {
return stats_.currently_allocated_bytes;
}
void Clear()
{
stats_ = DeviceStats();
}
void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
<< " ========";
LOG(CONSOLE) << "Peak memory usage: "
<< stats_.peak_allocated_bytes / 1048576 << "MiB";
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
}
};
} // namespace detail
inline detail::MemoryLogger &GlobalMemoryLogger() {
static detail::MemoryLogger memory_logger;
return memory_logger;
}
// dh::DebugSyncDevice(__FILE__, __LINE__);
inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
if (file != "" && line != -1) {
@@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
safe_cuda(cudaGetLastError());
}
namespace detail {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
inline void ThrowOOMError(std::string const& err, size_t bytes) {
auto device = CurrentDevice();
auto rank = xgboost::collective::GetRank();
std::stringstream ss;
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
<< "- Free memory: " << AvailableMemory(device) << "\n"
<< "- Requested memory: " << bytes << std::endl;
LOG(FATAL) << ss.str();
}
/**
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
*/
template <class T>
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template<typename U>
struct rebind // NOLINT
{
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
};
pointer allocate(size_t n) { // NOLINT
pointer ptr;
try {
ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
ThrowOOMError(e.what(), n * sizeof(T));
}
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
return ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
SuperT::deallocate(ptr, n);
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
};
/**
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
* RMM pool allocator is enabled. Does not initialise memory on construction.
*/
template <class T>
struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template<typename U>
struct rebind // NOLINT
{
using other = XGBCachingDeviceAllocatorImpl<U>; // NOLINT
};
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
pointer thrust_ptr;
if (use_cub_allocator_) {
T* raw_ptr{nullptr};
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
if (errc != cudaSuccess) {
ThrowOOMError("Caching allocator", n * sizeof(T));
}
thrust_ptr = pointer(raw_ptr);
} else {
try {
thrust_ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
ThrowOOMError(e.what(), n * sizeof(T));
}
}
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
if (use_cub_allocator_) {
GetGlobalCachingAllocator().DeviceFree(ptr.get());
} else {
SuperT::deallocate(ptr, n);
}
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBCachingDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBOOST_DEVICE void construct(T *) {} // NOLINT
private:
bool use_cub_allocator_{true};
};
} // namespace detail
// Declare xgboost allocators
// Replacement of allocator with custom backend should occur here
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
/*! Be careful that the initialization constructor is a no-op, which means calling
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
* `vec.resize(n, 0)`*/
template <typename T>
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
/** \brief Specialisation of thrust device vector using custom allocator. */
template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>; // NOLINT
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT
// Faster to instantiate than caching_device_vector and invokes no synchronisation
// Use this where vector functionality (e.g. resize) is not required
template <typename T>
@@ -734,6 +508,11 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
return ToSpan(vec, offset, size);
}
template <typename T>
xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
return {thrust::raw_pointer_cast(vec.data()), vec.size()};
}
// thrust begin, similar to std::begin
template <typename T>
thrust::device_ptr<T> tbegin(xgboost::HostDeviceVector<T>& vector) { // NOLINT
@@ -1117,6 +896,15 @@ class CUDAStream {
void Sync() { this->View().Sync(); }
};
inline auto CachingThrustPolicy() {
XGBCachingDeviceAllocator<char> alloc;
#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
return thrust::cuda::par_nosync(alloc).on(DefaultStream());
#else
return thrust::cuda::par(alloc).on(DefaultStream());
#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
}
// Force nvcc to load data as constant
template <typename T>
class LDGIterator {
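
Taken together, the two additions to this file compose as below. A hedged sketch assuming an RMM or Thrust >= 2 build; `DeviceUVector` comes from the new `device_vector.cuh` shown later, and the function itself is invented for illustration:

```cpp
#include <thrust/sequence.h>  // for sequence

#include "device_helpers.cuh"
#include "device_vector.cuh"

// Illustrative only: resize without initialization, write through the cached
// non-syncing policy, then expose the storage to kernels as a Span.
void FillAndView() {
  dh::DeviceUVector<int> vec;
  vec.Resize(100);  // no fill pass under RMM
  thrust::sequence(dh::CachingThrustPolicy(), vec.begin(), vec.end());
  auto s = dh::ToSpan(vec);    // Span<int> over the raw storage
  dh::DefaultStream().Sync();  // make the writes visible before host use
  (void)s;
}
```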


@@ -0,0 +1,27 @@
/**
* Copyright 2017-2024, XGBoost contributors
*/
#include "../collective/communicator-inl.h" // for GetRank
#include "device_helpers.cuh" // for CurrentDevice
#include "device_vector.cuh"
namespace dh {
namespace detail {
void ThrowOOMError(std::string const &err, size_t bytes) {
auto device = CurrentDevice();
auto rank = xgboost::collective::GetRank();
std::stringstream ss;
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
<< "- Free memory: " << dh::AvailableMemory(device) << "\n"
<< "- Requested memory: " << bytes << std::endl;
LOG(FATAL) << ss.str();
}
} // namespace detail
#if defined(XGBOOST_USE_RMM)
LoggingResource *GlobalLoggingResource() {
static auto mr{std::make_unique<LoggingResource>()};
return mr.get();
}
#endif // defined(XGBOOST_USE_RMM)
} // namespace dh
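
A short sketch of what the singleton is for, assuming an RMM build; `allocate`/`deallocate` are the standard `rmm::mr::device_memory_resource` entry points, and the function name is invented for illustration:

```cpp
#if defined(XGBOOST_USE_RMM)
#include <rmm/cuda_stream_view.hpp>  // for cuda_stream_per_thread

// Allocations routed through the singleton are recorded by the XGBoost memory
// logger (when debug verbosity is enabled), then forwarded upstream.
void LoggedAllocSketch() {
  auto* mr = dh::GlobalLoggingResource();
  void* p = mr->allocate(1 << 20, rmm::cuda_stream_per_thread);  // 1 MiB, registered
  mr->deallocate(p, 1 << 20, rmm::cuda_stream_per_thread);       // deregistered
}
#endif  // defined(XGBOOST_USE_RMM)
```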


@@ -0,0 +1,330 @@
/**
* Copyright 2017-2024, XGBoost Contributors
*/
#pragma once
#include <thrust/device_malloc_allocator.h> // for device_malloc_allocator
#include <thrust/device_ptr.h> // for device_ptr
#include <thrust/device_vector.h> // for device_vector
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include <rmm/device_uvector.hpp> // for device_uvector
#include <rmm/exec_policy.hpp> // for exec_policy_nosync
#include <rmm/mr/device/device_memory_resource.hpp> // for device_memory_resource
#include <rmm/mr/device/per_device_resource.hpp> // for get_current_device_resource
#include <rmm/mr/device/thrust_allocator_adaptor.hpp> // for thrust_allocator
#include <rmm/version_config.hpp> // for RMM_VERSION_MAJOR
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#error "Please use RMM version 0.18 or later"
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
#error "Please use RMM version 0.18 or later"
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include <cstddef> // for size_t
#include <cub/util_allocator.cuh> // for CachingDeviceAllocator
#include <cub/util_device.cuh> // for CurrentDevice
#include <map> // for map
#include <memory> // for unique_ptr
#include "common.h" // for safe_cuda
#include "xgboost/logging.h"
namespace dh {
namespace detail {
/** \brief Keeps track of global device memory allocations. Thread safe.*/
class MemoryLogger {
// Information for a single device
struct DeviceStats {
std::size_t currently_allocated_bytes{0};
size_t peak_allocated_bytes{0};
size_t num_allocations{0};
size_t num_deallocations{0};
std::map<void *, size_t> device_allocations;
void RegisterAllocation(void *ptr, size_t n) {
device_allocations[ptr] = n;
currently_allocated_bytes += n;
peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
num_allocations++;
CHECK_GT(num_allocations, num_deallocations);
}
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
auto itr = device_allocations.find(ptr);
if (itr == device_allocations.end()) {
LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
<< " that was never allocated\n"
<< dmlc::StackTrace();
} else {
num_deallocations++;
CHECK_LE(num_deallocations, num_allocations);
currently_allocated_bytes -= itr->second;
device_allocations.erase(itr);
}
}
};
DeviceStats stats_;
std::mutex mutex_;
public:
void RegisterAllocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
stats_.RegisterAllocation(ptr, n);
}
void RegisterDeallocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice());
}
size_t PeakMemory() const { return stats_.peak_allocated_bytes; }
size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; }
void Clear() { stats_ = DeviceStats(); }
void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
dh::safe_cuda(cudaGetDevice(&current_device));
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
<< " ========";
LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
}
};
void ThrowOOMError(std::string const &err, size_t bytes);
} // namespace detail
inline detail::MemoryLogger &GlobalMemoryLogger() {
static detail::MemoryLogger memory_logger;
return memory_logger;
}
namespace detail {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
/**
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
*/
template <class T>
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template <typename U>
struct rebind // NOLINT
{
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
};
pointer allocate(size_t n) { // NOLINT
pointer ptr;
try {
ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
detail::ThrowOOMError(e.what(), n * sizeof(T));
}
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
return ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
SuperT::deallocate(ptr, n);
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
};
/**
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
* RMM pool allocator is enabled. Does not initialise memory on construction.
*/
template <class T>
struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template <typename U>
struct rebind // NOLINT
{
using other = XGBCachingDeviceAllocatorImpl<U>; // NOLINT
};
cub::CachingDeviceAllocator &GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
pointer thrust_ptr;
if (use_cub_allocator_) {
T *raw_ptr{nullptr};
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
if (errc != cudaSuccess) {
detail::ThrowOOMError("Caching allocator", n * sizeof(T));
}
thrust_ptr = pointer(raw_ptr);
} else {
try {
thrust_ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
detail::ThrowOOMError(e.what(), n * sizeof(T));
}
}
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
if (use_cub_allocator_) {
GetGlobalCachingAllocator().DeviceFree(ptr.get());
} else {
SuperT::deallocate(ptr, n);
}
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBCachingDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBOOST_DEVICE void construct(T *) {} // NOLINT
private:
bool use_cub_allocator_{true};
};
} // namespace detail
// Declare xgboost allocators
// Replacement of allocator with custom backend should occur here
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
/** Be careful that the initialization constructor is a no-op, which means calling
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
* `vec.resize(n, 0)`
*/
template <typename T>
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
/** @brief Specialisation of thrust device vector using custom allocator. */
template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>; // NOLINT
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT
#if defined(XGBOOST_USE_RMM)
/**
* @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead.
*/
class LoggingResource : public rmm::mr::device_memory_resource {
rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()};
public:
LoggingResource() = default;
~LoggingResource() override = default;
LoggingResource(LoggingResource const &) = delete;
LoggingResource &operator=(LoggingResource const &) = delete;
LoggingResource(LoggingResource &&) noexcept = default;
LoggingResource &operator=(LoggingResource &&) noexcept = default;
[[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT
return mr_;
}
[[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT
return mr_;
}
void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT
try {
auto const ptr = mr_->allocate(bytes, stream);
GlobalMemoryLogger().RegisterAllocation(ptr, bytes);
return ptr;
} catch (rmm::bad_alloc const &e) {
detail::ThrowOOMError(e.what(), bytes);
}
return nullptr;
}
void do_deallocate(void *ptr, std::size_t bytes, // NOLINT
rmm::cuda_stream_view stream) override {
mr_->deallocate(ptr, bytes, stream);
GlobalMemoryLogger().RegisterDeallocation(ptr, bytes);
}
[[nodiscard]] bool do_is_equal( // NOLINT
device_memory_resource const &other) const noexcept override {
if (this == &other) {
return true;
}
auto const *cast = dynamic_cast<LoggingResource const *>(&other);
if (cast == nullptr) {
return mr_->is_equal(other);
}
return get_upstream_resource() == cast->get_upstream_resource();
}
};
LoggingResource *GlobalLoggingResource();
/**
* @brief Container class that doesn't initialize the data.
*/
template <typename T>
class DeviceUVector : public rmm::device_uvector<T> {
using Super = rmm::device_uvector<T>;
public:
DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
void Resize(std::size_t n, T const &v) {
auto orig = this->size();
Super::resize(n, rmm::cuda_stream_per_thread);
if (orig < n) {
thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
}
}
private:
// Declared private and left undefined: hides the base-class resize so callers must use Resize.
void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT
};
#else
/**
* @brief Without RMM, new elements are initialized on resize (thrust::device_vector behavior).
*/
template <typename T>
class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
public:
void Resize(std::size_t n) { Super::resize(n); }
void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
private:
// Declared private and left undefined: hides the base-class resize so callers must use Resize.
void resize(std::size_t n, T const &v = T{}); // NOLINT
};
#endif // defined(XGBOOST_USE_RMM)
} // namespace dh
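
To make the two build modes concrete, a behavior sketch for the wrapper; the sizes and fill value are illustrative:

```cpp
#include "device_vector.cuh"

void ResizeModes() {
  dh::DeviceUVector<float> vec;
  // RMM build: uninitialized storage from GlobalLoggingResource(); non-RMM
  // build: thrust::device_vector value-initializes the new elements.
  vec.Resize(4);
  // Both builds: the old four elements are preserved and only the new tail
  // [4, 8) is filled with -1.0f.
  vec.Resize(8, -1.0f);
}
```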


@@ -114,6 +114,11 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->Vec().resize(new_size, v);
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size) {
impl_->Vec().resize(new_size, T{});
}
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
std::fill(HostVector().begin(), HostVector().end(), v);


@@ -1,16 +1,17 @@
/**
* Copyright 2017-2023 by XGBoost contributors
* Copyright 2017-2024, XGBoost contributors
*/
#include <thrust/fill.h>
#include <thrust/device_ptr.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <cstdint>
#include "device_helpers.cuh"
#include "device_vector.cuh" // for DeviceUVector
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
#include "device_helpers.cuh"
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost {
@@ -28,7 +29,7 @@ class HostDeviceVectorImpl {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(size, v);
data_d_->Resize(size, v);
} else {
data_h_.resize(size, v);
}
@@ -66,22 +67,22 @@ class HostDeviceVectorImpl {
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
return data_d_->data().get();
return thrust::raw_pointer_cast(data_d_->data());
}
const T* ConstDevicePointer() {
LazySyncDevice(GPUAccess::kRead);
return data_d_->data().get();
return thrust::raw_pointer_cast(data_d_->data());
}
common::Span<T> DeviceSpan() {
LazySyncDevice(GPUAccess::kWrite);
return {data_d_->data().get(), Size()};
return {this->DevicePointer(), Size()};
}
common::Span<const T> ConstDeviceSpan() {
LazySyncDevice(GPUAccess::kRead);
return {data_d_->data().get(), Size()};
return {this->ConstDevicePointer(), Size()};
}
void Fill(T v) { // NOLINT
@@ -91,7 +92,7 @@ class HostDeviceVectorImpl {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
auto s_data = dh::ToSpan(*data_d_);
dh::LaunchN(data_d_->size(),
dh::LaunchN(data_d_->size(), dh::DefaultStream(),
[=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
}
}
@@ -128,7 +129,7 @@ class HostDeviceVectorImpl {
void Extend(HostDeviceVectorImpl* other) {
auto ori_size = this->Size();
this->Resize(ori_size + other->Size(), T());
this->Resize(ori_size + other->Size(), T{});
if (HostCanWrite() && other->HostCanRead()) {
auto& h_vec = this->HostVector();
auto& other_vec = other->HostVector();
@@ -138,10 +139,9 @@ class HostDeviceVectorImpl {
auto ptr = other->ConstDevicePointer();
SetDevice();
CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
dh::DefaultStream()));
}
}
@@ -171,17 +171,22 @@ class HostDeviceVectorImpl {
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
template <typename... U>
auto Resize(std::size_t new_size, U&&... args) {
if (new_size == Size()) {
return;
}
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(new_size, v);
auto old_size = data_d_->size();
data_d_->Resize(new_size, std::forward<U>(args)...);
} else {
// resize on host
LazySyncHost(GPUAccess::kNone);
data_h_.resize(new_size, v);
auto old_size = data_h_.size();
data_h_.resize(new_size, std::forward<U>(args)...);
}
}
@@ -195,10 +200,8 @@ class HostDeviceVectorImpl {
gpu_access_ = access;
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_->data().get(),
data_d_->size() * sizeof(T),
cudaMemcpyDeviceToHost));
dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
}
void LazySyncDevice(GPUAccess access) {
@@ -211,10 +214,9 @@ class HostDeviceVectorImpl {
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
data_h_.data(),
data_d_->size() * sizeof(T),
cudaMemcpyHostToDevice));
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
dh::DefaultStream()));
gpu_access_ = access;
}
@@ -229,7 +231,7 @@ class HostDeviceVectorImpl {
private:
DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{};
std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone};
void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -239,8 +241,10 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite;
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
thrust::raw_pointer_cast(other->data_d_->data()),
data_d_->size() * sizeof(T), cudaMemcpyDefault,
dh::DefaultStream()));
}
}
@@ -248,14 +252,15 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite;
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault,
dh::DefaultStream()));
}
void LazyResizeDevice(size_t new_size) {
if (data_d_ && new_size == data_d_->size()) { return; }
SetDevice();
data_d_->resize(new_size);
data_d_->Resize(new_size);
}
void SetDevice() {
@@ -267,7 +272,7 @@ class HostDeviceVectorImpl {
}
if (!data_d_) {
data_d_.reset(new dh::device_vector<T>);
data_d_.reset(new dh::DeviceUVector<T>{});
}
}
};
@@ -397,7 +402,12 @@ void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
void HostDeviceVector<T>::Resize(std::size_t new_size) {
impl_->Resize(new_size);
}
template <typename T>
void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
impl_->Resize(new_size, v);
}
@@ -427,5 +437,4 @@ template class HostDeviceVector<RTreeNodeStat>;
*/
template class HostDeviceVector<std::size_t>;
#endif // defined(__APPLE__)
} // namespace xgboost
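
The variadic `Resize` above lets a single body serve both public overloads. A reduced, self-contained model of the forwarding; the struct and names are invented for illustration:

```cpp
#include <cstddef>  // for size_t
#include <utility>  // for forward
#include <vector>   // for vector

// Zero extra arguments select resize(n); one argument selects resize(n, v).
// HostDeviceVectorImpl::Resize uses the same trick to dispatch to the two
// DeviceUVector::Resize overloads without duplicating the sync logic.
template <typename T>
struct ForwardingModel {
  std::vector<T> data;
  template <typename... U>
  void Resize(std::size_t n, U&&... args) {
    data.resize(n, std::forward<U>(args)...);
  }
};

// ForwardingModel<int>{}.Resize(8);      // calls data.resize(8)
// ForwardingModel<int>{}.Resize(8, 42);  // calls data.resize(8, 42)
```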


@@ -4,12 +4,14 @@
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
#define XGBOOST_COMMON_QUANTILE_CUH_
#include "xgboost/span.h"
#include "xgboost/data.h"
#include <thrust/logical.h> // for any_of
#include "categorical.h"
#include "device_helpers.cuh"
#include "quantile.h"
#include "timer.h"
#include "categorical.h"
#include "xgboost/data.h"
#include "xgboost/span.h"
namespace xgboost {
namespace common {
@@ -100,9 +102,9 @@ class SketchContainer {
CHECK(device.IsCUDA());
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_.Resize(num_columns + 1, 0);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->columns_ptr_b_.Resize(num_columns + 1, 0);
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);


@@ -1,7 +1,8 @@
/**
* Copyright 2021-2024, XGBoost Contributors
*/
#include <thrust/copy.h> // for copy
#include <thrust/copy.h> // for copy
#include <thrust/logical.h> // for any_of
#include <thrust/scan.h>
#include <cassert>


@@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater {
out["hist_train_param"] = ToJson(hist_maker_param_);
}
~GPUHistMaker() { // NOLINT
dh::GlobalMemoryLogger().Log();
}
~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); }
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,


@@ -0,0 +1,21 @@
/**
* Copyright 2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include "../../../src/common/device_vector.cuh"
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
namespace dh {
TEST(DeviceUVector, Basic) {
GlobalMemoryLogger().Clear();
std::int32_t verbosity{3};
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
DeviceUVector<float> uvec;
uvec.Resize(12);
auto peak = GlobalMemoryLogger().PeakMemory();
auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
ASSERT_EQ(peak, n_bytes);
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
}
} // namespace dh


@@ -1,5 +1,5 @@
/**
* Copyright 2018-2023 XGBoost contributors
* Copyright 2018-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <thrust/equal.h>
@@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) {
ASSERT_FALSE(another.Empty());
ASSERT_TRUE(vec.Empty());
}
TEST(HostDeviceVector, Resize) {
auto check = [&](HostDeviceVector<float> const& vec) {
auto const& h_vec = vec.ConstHostSpan();
for (std::size_t i = 0; i < 4; ++i) {
ASSERT_EQ(h_vec[i], i + 1);
}
for (std::size_t i = 4; i < vec.Size(); ++i) {
ASSERT_EQ(h_vec[i], 3.0);
}
};
{
HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
vec.SetDevice(DeviceOrd::CUDA(0));
vec.ConstDeviceSpan();
ASSERT_TRUE(vec.DeviceCanRead());
ASSERT_FALSE(vec.DeviceCanWrite());
vec.DeviceSpan();
vec.Resize(7, 3.0f);
ASSERT_TRUE(vec.DeviceCanWrite());
check(vec);
}
{
HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
ASSERT_TRUE(vec.DeviceCanWrite());
vec.Resize(7, 3.0f);
ASSERT_TRUE(vec.DeviceCanWrite());
check(vec);
}
{
HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
ASSERT_TRUE(vec.HostCanWrite());
vec.Resize(7, 3.0f);
ASSERT_TRUE(vec.HostCanWrite());
check(vec);
}
}
} // namespace xgboost::common


@@ -1,15 +1,14 @@
// Copyright (c) 2019 by Contributors
/**
* Copyright 2019-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h> // for device
#include <thrust/sequence.h> // for sequence
#include <xgboost/data.h>
#include <xgboost/json.h>
#include <thrust/device_vector.h>
#include <memory>
#include "../../../src/common/bitfield.h"
#include "../../../src/common/device_helpers.cuh"
namespace xgboost {
template <typename T>
Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
thrust::device_vector<T>* out_d_data) {