Avoid thrust vector initialization. (#10544)
* Avoid thrust vector initialization. - Add a wrapper for rmm device uvector. - Split up the `Resize` method for HDV.
This commit is contained in:
parent
89da9f9741
commit
1ca4bfd20e
@ -135,7 +135,9 @@ class HostDeviceVector {
|
||||
|
||||
void SetDevice(DeviceOrd device) const;
|
||||
|
||||
void Resize(size_t new_size, T v = T());
|
||||
void Resize(std::size_t new_size);
|
||||
/** @brief Resize and initialize the data if the new size is larger than the old size. */
|
||||
void Resize(std::size_t new_size, T v);
|
||||
|
||||
using value_type = T; // NOLINT
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ struct CUDAContext {
|
||||
* \brief Caching thrust policy.
|
||||
*/
|
||||
auto CTP() const {
|
||||
#if THRUST_MAJOR_VERSION >= 2
|
||||
#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
|
||||
return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
|
||||
#else
|
||||
return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
|
||||
|
||||
@ -1,26 +1,21 @@
|
||||
/**
|
||||
* Copyright 2017-2023 XGBoost contributors
|
||||
* Copyright 2017-2024, XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <thrust/binary_search.h> // thrust::upper_bound
|
||||
#include <thrust/device_malloc_allocator.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/binary_search.h> // thrust::upper_bound
|
||||
#include <thrust/device_ptr.h> // for device_ptr
|
||||
#include <thrust/device_vector.h> // for device_vector
|
||||
#include <thrust/execution_policy.h> // thrust::seq
|
||||
#include <thrust/gather.h> // gather
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/iterator/discard_iterator.h> // for discard_iterator
|
||||
#include <thrust/iterator/transform_output_iterator.h> // make_transform_output_iterator
|
||||
#include <thrust/logical.h>
|
||||
#include <thrust/sequence.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
#include <thrust/system_error.h>
|
||||
#include <thrust/transform_scan.h>
|
||||
#include <thrust/unique.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cub/cub.cuh>
|
||||
#include <cub/util_allocator.cuh>
|
||||
#include <cub/util_type.cuh> // for UnitWord
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
@ -28,22 +23,14 @@
|
||||
|
||||
#include "../collective/communicator-inl.h"
|
||||
#include "common.h"
|
||||
#include "device_vector.cuh"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
#include "rmm/mr/device/per_device_resource.hpp"
|
||||
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
|
||||
#include "rmm/version_config.hpp"
|
||||
|
||||
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
|
||||
#error "Please use RMM version 0.18 or later"
|
||||
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
|
||||
#error "Please use RMM version 0.18 or later"
|
||||
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
|
||||
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
#if defined(XGBOOST_USE_RMM)
|
||||
#include <rmm/exec_policy.hpp>
|
||||
#endif // defined(XGBOOST_USE_RMM)
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
|
||||
|
||||
@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
|
||||
LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
/** \brief Keeps track of global device memory allocations. Thread safe.*/
|
||||
class MemoryLogger {
|
||||
// Information for a single device
|
||||
struct DeviceStats {
|
||||
size_t currently_allocated_bytes{ 0 };
|
||||
size_t peak_allocated_bytes{ 0 };
|
||||
size_t num_allocations{ 0 };
|
||||
size_t num_deallocations{ 0 };
|
||||
std::map<void *, size_t> device_allocations;
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
device_allocations[ptr] = n;
|
||||
currently_allocated_bytes += n;
|
||||
peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
|
||||
num_allocations++;
|
||||
CHECK_GT(num_allocations, num_deallocations);
|
||||
}
|
||||
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
|
||||
auto itr = device_allocations.find(ptr);
|
||||
if (itr == device_allocations.end()) {
|
||||
LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
|
||||
<< " that was never allocated\n"
|
||||
<< dmlc::StackTrace();
|
||||
} else {
|
||||
num_deallocations++;
|
||||
CHECK_LE(num_deallocations, num_allocations);
|
||||
currently_allocated_bytes -= itr->second;
|
||||
device_allocations.erase(itr);
|
||||
}
|
||||
}
|
||||
};
|
||||
DeviceStats stats_;
|
||||
std::mutex mutex_;
|
||||
|
||||
public:
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_.RegisterAllocation(ptr, n);
|
||||
}
|
||||
void RegisterDeallocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
stats_.RegisterDeallocation(ptr, n, current_device);
|
||||
}
|
||||
size_t PeakMemory() const {
|
||||
return stats_.peak_allocated_bytes;
|
||||
}
|
||||
size_t CurrentlyAllocatedBytes() const {
|
||||
return stats_.currently_allocated_bytes;
|
||||
}
|
||||
void Clear()
|
||||
{
|
||||
stats_ = DeviceStats();
|
||||
}
|
||||
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
safe_cuda(cudaGetDevice(¤t_device));
|
||||
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: "
|
||||
<< stats_.peak_allocated_bytes / 1048576 << "MiB";
|
||||
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
|
||||
}
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
inline detail::MemoryLogger &GlobalMemoryLogger() {
|
||||
static detail::MemoryLogger memory_logger;
|
||||
return memory_logger;
|
||||
}
|
||||
|
||||
// dh::DebugSyncDevice(__FILE__, __LINE__);
|
||||
inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
|
||||
if (file != "" && line != -1) {
|
||||
@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
|
||||
safe_cuda(cudaGetLastError());
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
template <typename T>
|
||||
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
|
||||
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
template <typename T>
|
||||
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
|
||||
inline void ThrowOOMError(std::string const& err, size_t bytes) {
|
||||
auto device = CurrentDevice();
|
||||
auto rank = xgboost::collective::GetRank();
|
||||
std::stringstream ss;
|
||||
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
|
||||
<< "- Free memory: " << AvailableMemory(device) << "\n"
|
||||
<< "- Requested memory: " << bytes << std::endl;
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
|
||||
*/
|
||||
template <class T>
|
||||
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
|
||||
using SuperT = XGBBaseDeviceAllocator<T>;
|
||||
using pointer = thrust::device_ptr<T>; // NOLINT
|
||||
template<typename U>
|
||||
struct rebind // NOLINT
|
||||
{
|
||||
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
|
||||
};
|
||||
pointer allocate(size_t n) { // NOLINT
|
||||
pointer ptr;
|
||||
try {
|
||||
ptr = SuperT::allocate(n);
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
} catch (const std::exception &e) {
|
||||
ThrowOOMError(e.what(), n * sizeof(T));
|
||||
}
|
||||
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
|
||||
return ptr;
|
||||
}
|
||||
void deallocate(pointer ptr, size_t n) { // NOLINT
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
|
||||
SuperT::deallocate(ptr, n);
|
||||
}
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
XGBDefaultDeviceAllocatorImpl()
|
||||
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
|
||||
* RMM pool allocator is enabled. Does not initialise memory on construction.
|
||||
*/
|
||||
template <class T>
|
||||
struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
|
||||
using SuperT = XGBBaseDeviceAllocator<T>;
|
||||
using pointer = thrust::device_ptr<T>; // NOLINT
|
||||
template<typename U>
|
||||
struct rebind // NOLINT
|
||||
{
|
||||
using other = XGBCachingDeviceAllocatorImpl<U>; // NOLINT
|
||||
};
|
||||
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
|
||||
// Configure allocator with maximum cached bin size of ~1GB and no limit on
|
||||
// maximum cached bytes
|
||||
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
|
||||
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
|
||||
return *allocator;
|
||||
}
|
||||
pointer allocate(size_t n) { // NOLINT
|
||||
pointer thrust_ptr;
|
||||
if (use_cub_allocator_) {
|
||||
T* raw_ptr{nullptr};
|
||||
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
|
||||
n * sizeof(T));
|
||||
if (errc != cudaSuccess) {
|
||||
ThrowOOMError("Caching allocator", n * sizeof(T));
|
||||
}
|
||||
thrust_ptr = pointer(raw_ptr);
|
||||
} else {
|
||||
try {
|
||||
thrust_ptr = SuperT::allocate(n);
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
} catch (const std::exception &e) {
|
||||
ThrowOOMError(e.what(), n * sizeof(T));
|
||||
}
|
||||
}
|
||||
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
|
||||
return thrust_ptr;
|
||||
}
|
||||
void deallocate(pointer ptr, size_t n) { // NOLINT
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
|
||||
if (use_cub_allocator_) {
|
||||
GetGlobalCachingAllocator().DeviceFree(ptr.get());
|
||||
} else {
|
||||
SuperT::deallocate(ptr, n);
|
||||
}
|
||||
}
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
XGBCachingDeviceAllocatorImpl()
|
||||
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
|
||||
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
XGBOOST_DEVICE void construct(T *) {} // NOLINT
|
||||
private:
|
||||
bool use_cub_allocator_{true};
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
// Declare xgboost allocators
|
||||
// Replacement of allocator with custom backend should occur here
|
||||
template <typename T>
|
||||
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
|
||||
/*! Be careful that the initialization constructor is a no-op, which means calling
|
||||
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
|
||||
* `vec.resize(n, 0)`*/
|
||||
template <typename T>
|
||||
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
|
||||
/** \brief Specialisation of thrust device vector using custom allocator. */
|
||||
template <typename T>
|
||||
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>; // NOLINT
|
||||
template <typename T>
|
||||
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT
|
||||
|
||||
// Faster to instantiate than caching_device_vector and invokes no synchronisation
|
||||
// Use this where vector functionality (e.g. resize) is not required
|
||||
template <typename T>
|
||||
@ -734,6 +508,11 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
|
||||
return ToSpan(vec, offset, size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
|
||||
return {thrust::raw_pointer_cast(vec.data()), vec.size()};
|
||||
}
|
||||
|
||||
// thrust begin, similiar to std::begin
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> tbegin(xgboost::HostDeviceVector<T>& vector) { // NOLINT
|
||||
@ -1117,6 +896,15 @@ class CUDAStream {
|
||||
void Sync() { this->View().Sync(); }
|
||||
};
|
||||
|
||||
inline auto CachingThrustPolicy() {
|
||||
XGBCachingDeviceAllocator<char> alloc;
|
||||
#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
|
||||
return thrust::cuda::par_nosync(alloc).on(DefaultStream());
|
||||
#else
|
||||
return thrust::cuda::par(alloc).on(DefaultStream());
|
||||
#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
|
||||
}
|
||||
|
||||
// Force nvcc to load data as constant
|
||||
template <typename T>
|
||||
class LDGIterator {
|
||||
|
||||
27
src/common/device_vector.cu
Normal file
27
src/common/device_vector.cu
Normal file
@ -0,0 +1,27 @@
|
||||
/**
|
||||
* Copyright 2017-2024, XGBoost contributors
|
||||
*/
|
||||
#include "../collective/communicator-inl.h" // for GetRank
|
||||
#include "device_helpers.cuh" // for CurrentDevice
|
||||
#include "device_vector.cuh"
|
||||
|
||||
namespace dh {
|
||||
namespace detail {
|
||||
void ThrowOOMError(std::string const &err, size_t bytes) {
|
||||
auto device = CurrentDevice();
|
||||
auto rank = xgboost::collective::GetRank();
|
||||
std::stringstream ss;
|
||||
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
|
||||
<< "- Free memory: " << dh::AvailableMemory(device) << "\n"
|
||||
<< "- Requested memory: " << bytes << std::endl;
|
||||
LOG(FATAL) << ss.str();
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
#if defined(XGBOOST_USE_RMM)
|
||||
LoggingResource *GlobalLoggingResource() {
|
||||
static auto mr{std::make_unique<LoggingResource>()};
|
||||
return mr.get();
|
||||
}
|
||||
#endif // defined(XGBOOST_USE_RMM)
|
||||
} // namespace dh
|
||||
330
src/common/device_vector.cuh
Normal file
330
src/common/device_vector.cuh
Normal file
@ -0,0 +1,330 @@
|
||||
/**
|
||||
* Copyright 2017-2024, XGBoost Contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <thrust/device_malloc_allocator.h> // for device_malloc_allocator
|
||||
#include <thrust/device_ptr.h> // for device_ptr
|
||||
#include <thrust/device_vector.h> // for device_vector
|
||||
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
#include <rmm/device_uvector.hpp> // for device_uvector
|
||||
#include <rmm/exec_policy.hpp> // for exec_policy_nosync
|
||||
#include <rmm/mr/device/device_memory_resource.hpp> // for device_memory_resource
|
||||
#include <rmm/mr/device/per_device_resource.hpp> // for get_current_device_resource
|
||||
#include <rmm/mr/device/thrust_allocator_adaptor.hpp> // for thrust_allocator
|
||||
#include <rmm/version_config.hpp> // for RMM_VERSION_MAJOR
|
||||
|
||||
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
|
||||
|
||||
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
|
||||
|
||||
#error "Please use RMM version 0.18 or later"
|
||||
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
|
||||
#error "Please use RMM version 0.18 or later"
|
||||
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
|
||||
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
#include <cub/util_allocator.cuh> // for CachingDeviceAllocator
|
||||
#include <cub/util_device.cuh> // for CurrentDevice
|
||||
#include <map> // for map
|
||||
#include <memory> // for unique_ptr
|
||||
|
||||
#include "common.h" // for safe_cuda
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace dh {
|
||||
namespace detail {
|
||||
/** \brief Keeps track of global device memory allocations. Thread safe.*/
|
||||
class MemoryLogger {
|
||||
// Information for a single device
|
||||
struct DeviceStats {
|
||||
std::size_t currently_allocated_bytes{0};
|
||||
size_t peak_allocated_bytes{0};
|
||||
size_t num_allocations{0};
|
||||
size_t num_deallocations{0};
|
||||
std::map<void *, size_t> device_allocations;
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
device_allocations[ptr] = n;
|
||||
currently_allocated_bytes += n;
|
||||
peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
|
||||
num_allocations++;
|
||||
CHECK_GT(num_allocations, num_deallocations);
|
||||
}
|
||||
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
|
||||
auto itr = device_allocations.find(ptr);
|
||||
if (itr == device_allocations.end()) {
|
||||
LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
|
||||
<< " that was never allocated\n"
|
||||
<< dmlc::StackTrace();
|
||||
} else {
|
||||
num_deallocations++;
|
||||
CHECK_LE(num_deallocations, num_allocations);
|
||||
currently_allocated_bytes -= itr->second;
|
||||
device_allocations.erase(itr);
|
||||
}
|
||||
}
|
||||
};
|
||||
DeviceStats stats_;
|
||||
std::mutex mutex_;
|
||||
|
||||
public:
|
||||
void RegisterAllocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
stats_.RegisterAllocation(ptr, n);
|
||||
}
|
||||
void RegisterDeallocation(void *ptr, size_t n) {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice());
|
||||
}
|
||||
size_t PeakMemory() const { return stats_.peak_allocated_bytes; }
|
||||
size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; }
|
||||
void Clear() { stats_ = DeviceStats(); }
|
||||
|
||||
void Log() {
|
||||
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
int current_device;
|
||||
dh::safe_cuda(cudaGetDevice(¤t_device));
|
||||
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
|
||||
<< " ========";
|
||||
LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
|
||||
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
|
||||
}
|
||||
};
|
||||
|
||||
void ThrowOOMError(std::string const &err, size_t bytes);
|
||||
} // namespace detail
|
||||
|
||||
inline detail::MemoryLogger &GlobalMemoryLogger() {
|
||||
static detail::MemoryLogger memory_logger;
|
||||
return memory_logger;
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
template <typename T>
|
||||
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
|
||||
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
template <typename T>
|
||||
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
|
||||
/**
|
||||
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
|
||||
*/
|
||||
template <class T>
|
||||
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
|
||||
using SuperT = XGBBaseDeviceAllocator<T>;
|
||||
using pointer = thrust::device_ptr<T>; // NOLINT
|
||||
template <typename U>
|
||||
struct rebind // NOLINT
|
||||
{
|
||||
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
|
||||
};
|
||||
pointer allocate(size_t n) { // NOLINT
|
||||
pointer ptr;
|
||||
try {
|
||||
ptr = SuperT::allocate(n);
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
} catch (const std::exception &e) {
|
||||
detail::ThrowOOMError(e.what(), n * sizeof(T));
|
||||
}
|
||||
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
|
||||
return ptr;
|
||||
}
|
||||
void deallocate(pointer ptr, size_t n) { // NOLINT
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
|
||||
SuperT::deallocate(ptr, n);
|
||||
}
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
XGBDefaultDeviceAllocatorImpl()
|
||||
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
|
||||
* RMM pool allocator is enabled. Does not initialise memory on construction.
|
||||
*/
|
||||
template <class T>
|
||||
struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
|
||||
using SuperT = XGBBaseDeviceAllocator<T>;
|
||||
using pointer = thrust::device_ptr<T>; // NOLINT
|
||||
template <typename U>
|
||||
struct rebind // NOLINT
|
||||
{
|
||||
using other = XGBCachingDeviceAllocatorImpl<U>; // NOLINT
|
||||
};
|
||||
cub::CachingDeviceAllocator &GetGlobalCachingAllocator() {
|
||||
// Configure allocator with maximum cached bin size of ~1GB and no limit on
|
||||
// maximum cached bytes
|
||||
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
|
||||
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
|
||||
return *allocator;
|
||||
}
|
||||
pointer allocate(size_t n) { // NOLINT
|
||||
pointer thrust_ptr;
|
||||
if (use_cub_allocator_) {
|
||||
T *raw_ptr{nullptr};
|
||||
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
|
||||
n * sizeof(T));
|
||||
if (errc != cudaSuccess) {
|
||||
detail::ThrowOOMError("Caching allocator", n * sizeof(T));
|
||||
}
|
||||
thrust_ptr = pointer(raw_ptr);
|
||||
} else {
|
||||
try {
|
||||
thrust_ptr = SuperT::allocate(n);
|
||||
dh::safe_cuda(cudaGetLastError());
|
||||
} catch (const std::exception &e) {
|
||||
detail::ThrowOOMError(e.what(), n * sizeof(T));
|
||||
}
|
||||
}
|
||||
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
|
||||
return thrust_ptr;
|
||||
}
|
||||
void deallocate(pointer ptr, size_t n) { // NOLINT
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
|
||||
if (use_cub_allocator_) {
|
||||
GetGlobalCachingAllocator().DeviceFree(ptr.get());
|
||||
} else {
|
||||
SuperT::deallocate(ptr, n);
|
||||
}
|
||||
}
|
||||
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
XGBCachingDeviceAllocatorImpl()
|
||||
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
|
||||
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
|
||||
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
|
||||
XGBOOST_DEVICE void construct(T *) {} // NOLINT
|
||||
private:
|
||||
bool use_cub_allocator_{true};
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
// Declare xgboost allocators
|
||||
// Replacement of allocator with custom backend should occur here
|
||||
template <typename T>
|
||||
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
|
||||
|
||||
/** Be careful that the initialization constructor is a no-op, which means calling
|
||||
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
|
||||
* `vec.resize(n, 0)`
|
||||
*/
|
||||
template <typename T>
|
||||
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
|
||||
|
||||
/** @brief Specialisation of thrust device vector using custom allocator. */
|
||||
template <typename T>
|
||||
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>; // NOLINT
|
||||
template <typename T>
|
||||
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT
|
||||
|
||||
#if defined(XGBOOST_USE_RMM)
|
||||
/**
|
||||
* @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead.
|
||||
*/
|
||||
class LoggingResource : public rmm::mr::device_memory_resource {
|
||||
rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()};
|
||||
|
||||
public:
|
||||
LoggingResource() = default;
|
||||
~LoggingResource() override = default;
|
||||
LoggingResource(LoggingResource const &) = delete;
|
||||
LoggingResource &operator=(LoggingResource const &) = delete;
|
||||
LoggingResource(LoggingResource &&) noexcept = default;
|
||||
LoggingResource &operator=(LoggingResource &&) noexcept = default;
|
||||
|
||||
[[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT
|
||||
return mr_;
|
||||
}
|
||||
[[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT
|
||||
return mr_;
|
||||
}
|
||||
|
||||
void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT
|
||||
try {
|
||||
auto const ptr = mr_->allocate(bytes, stream);
|
||||
GlobalMemoryLogger().RegisterAllocation(ptr, bytes);
|
||||
return ptr;
|
||||
} catch (rmm::bad_alloc const &e) {
|
||||
detail::ThrowOOMError(e.what(), bytes);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void do_deallocate(void *ptr, std::size_t bytes, // NOLINT
|
||||
rmm::cuda_stream_view stream) override {
|
||||
mr_->deallocate(ptr, bytes, stream);
|
||||
GlobalMemoryLogger().RegisterDeallocation(ptr, bytes);
|
||||
}
|
||||
|
||||
[[nodiscard]] bool do_is_equal( // NOLINT
|
||||
device_memory_resource const &other) const noexcept override {
|
||||
if (this == &other) {
|
||||
return true;
|
||||
}
|
||||
auto const *cast = dynamic_cast<LoggingResource const *>(&other);
|
||||
if (cast == nullptr) {
|
||||
return mr_->is_equal(other);
|
||||
}
|
||||
return get_upstream_resource() == cast->get_upstream_resource();
|
||||
}
|
||||
};
|
||||
|
||||
LoggingResource *GlobalLoggingResource();
|
||||
|
||||
/**
|
||||
* @brief Container class that doesn't initialize the data.
|
||||
*/
|
||||
template <typename T>
|
||||
class DeviceUVector : public rmm::device_uvector<T> {
|
||||
using Super = rmm::device_uvector<T>;
|
||||
|
||||
public:
|
||||
DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
|
||||
|
||||
void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
|
||||
void Resize(std::size_t n, T const &v) {
|
||||
auto orig = this->size();
|
||||
Super::resize(n, rmm::cuda_stream_per_thread);
|
||||
if (orig < n) {
|
||||
thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// undefined private, cannot be accessed.
|
||||
void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
/**
|
||||
* @brief Without RMM, the initialization will happen.
|
||||
*/
|
||||
template <typename T>
|
||||
class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
|
||||
using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
|
||||
|
||||
public:
|
||||
void Resize(std::size_t n) { Super::resize(n); }
|
||||
void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
|
||||
|
||||
private:
|
||||
// undefined private, cannot be accessed.
|
||||
void resize(std::size_t n, T const &v = T{}); // NOLINT
|
||||
};
|
||||
|
||||
#endif // defined(XGBOOST_USE_RMM)
|
||||
} // namespace dh
|
||||
@ -114,6 +114,11 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
impl_->Vec().resize(new_size, v);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Resize(size_t new_size) {
|
||||
impl_->Vec().resize(new_size, T{});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Fill(T v) {
|
||||
std::fill(HostVector().begin(), HostVector().end(), v);
|
||||
|
||||
@ -1,16 +1,17 @@
|
||||
/**
|
||||
* Copyright 2017-2023 by XGBoost contributors
|
||||
* Copyright 2017-2024, XGBoost contributors
|
||||
*/
|
||||
#include <thrust/fill.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint>
|
||||
|
||||
#include "device_helpers.cuh"
|
||||
#include "device_vector.cuh" // for DeviceUVector
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "xgboost/tree_model.h" // for RegTree
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@ -28,7 +29,7 @@ class HostDeviceVectorImpl {
|
||||
if (device.IsCUDA()) {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
data_d_->resize(size, v);
|
||||
data_d_->Resize(size, v);
|
||||
} else {
|
||||
data_h_.resize(size, v);
|
||||
}
|
||||
@ -66,22 +67,22 @@ class HostDeviceVectorImpl {
|
||||
|
||||
T* DevicePointer() {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
return data_d_->data().get();
|
||||
return thrust::raw_pointer_cast(data_d_->data());
|
||||
}
|
||||
|
||||
const T* ConstDevicePointer() {
|
||||
LazySyncDevice(GPUAccess::kRead);
|
||||
return data_d_->data().get();
|
||||
return thrust::raw_pointer_cast(data_d_->data());
|
||||
}
|
||||
|
||||
common::Span<T> DeviceSpan() {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
return {data_d_->data().get(), Size()};
|
||||
return {this->DevicePointer(), Size()};
|
||||
}
|
||||
|
||||
common::Span<const T> ConstDeviceSpan() {
|
||||
LazySyncDevice(GPUAccess::kRead);
|
||||
return {data_d_->data().get(), Size()};
|
||||
return {this->ConstDevicePointer(), Size()};
|
||||
}
|
||||
|
||||
void Fill(T v) { // NOLINT
|
||||
@ -91,7 +92,7 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
auto s_data = dh::ToSpan(*data_d_);
|
||||
dh::LaunchN(data_d_->size(),
|
||||
dh::LaunchN(data_d_->size(), dh::DefaultStream(),
|
||||
[=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
|
||||
}
|
||||
}
|
||||
@ -128,7 +129,7 @@ class HostDeviceVectorImpl {
|
||||
|
||||
void Extend(HostDeviceVectorImpl* other) {
|
||||
auto ori_size = this->Size();
|
||||
this->Resize(ori_size + other->Size(), T());
|
||||
this->Resize(ori_size + other->Size(), T{});
|
||||
if (HostCanWrite() && other->HostCanRead()) {
|
||||
auto& h_vec = this->HostVector();
|
||||
auto& other_vec = other->HostVector();
|
||||
@ -138,10 +139,9 @@ class HostDeviceVectorImpl {
|
||||
auto ptr = other->ConstDevicePointer();
|
||||
SetDevice();
|
||||
CHECK_EQ(this->Device(), other->Device());
|
||||
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
|
||||
ptr,
|
||||
other->Size() * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
|
||||
other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
|
||||
dh::DefaultStream()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -171,17 +171,22 @@ class HostDeviceVectorImpl {
|
||||
}
|
||||
}
|
||||
|
||||
void Resize(size_t new_size, T v) {
|
||||
if (new_size == Size()) { return; }
|
||||
template <typename... U>
|
||||
auto Resize(std::size_t new_size, U&&... args) {
|
||||
if (new_size == Size()) {
|
||||
return;
|
||||
}
|
||||
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
|
||||
// fast on-device resize
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
data_d_->resize(new_size, v);
|
||||
auto old_size = data_d_->size();
|
||||
data_d_->Resize(new_size, std::forward<U>(args)...);
|
||||
} else {
|
||||
// resize on host
|
||||
LazySyncHost(GPUAccess::kNone);
|
||||
data_h_.resize(new_size, v);
|
||||
auto old_size = data_h_.size();
|
||||
data_h_.resize(new_size, std::forward<U>(args)...);
|
||||
}
|
||||
}
|
||||
|
||||
@ -195,10 +200,8 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_ = access;
|
||||
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(),
|
||||
data_d_->data().get(),
|
||||
data_d_->size() * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void LazySyncDevice(GPUAccess access) {
|
||||
@ -211,10 +214,9 @@ class HostDeviceVectorImpl {
|
||||
// data is on the host
|
||||
LazyResizeDevice(data_h_.size());
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
|
||||
data_h_.data(),
|
||||
data_d_->size() * sizeof(T),
|
||||
cudaMemcpyHostToDevice));
|
||||
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
|
||||
data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
|
||||
dh::DefaultStream()));
|
||||
gpu_access_ = access;
|
||||
}
|
||||
|
||||
@ -229,7 +231,7 @@ class HostDeviceVectorImpl {
|
||||
private:
|
||||
DeviceOrd device_{DeviceOrd::CPU()};
|
||||
std::vector<T> data_h_{};
|
||||
std::unique_ptr<dh::device_vector<T>> data_d_{};
|
||||
std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
|
||||
GPUAccess gpu_access_{GPUAccess::kNone};
|
||||
|
||||
void CopyToDevice(HostDeviceVectorImpl* other) {
|
||||
@ -239,8 +241,10 @@ class HostDeviceVectorImpl {
|
||||
LazyResizeDevice(Size());
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault));
|
||||
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
|
||||
thrust::raw_pointer_cast(other->data_d_->data()),
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault,
|
||||
dh::DefaultStream()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -248,14 +252,15 @@ class HostDeviceVectorImpl {
|
||||
LazyResizeDevice(Size());
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault));
|
||||
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault,
|
||||
dh::DefaultStream()));
|
||||
}
|
||||
|
||||
void LazyResizeDevice(size_t new_size) {
|
||||
if (data_d_ && new_size == data_d_->size()) { return; }
|
||||
SetDevice();
|
||||
data_d_->resize(new_size);
|
||||
data_d_->Resize(new_size);
|
||||
}
|
||||
|
||||
void SetDevice() {
|
||||
@ -267,7 +272,7 @@ class HostDeviceVectorImpl {
|
||||
}
|
||||
|
||||
if (!data_d_) {
|
||||
data_d_.reset(new dh::device_vector<T>);
|
||||
data_d_.reset(new dh::DeviceUVector<T>{});
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -397,7 +402,12 @@ void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
void HostDeviceVector<T>::Resize(std::size_t new_size) {
|
||||
impl_->Resize(new_size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
|
||||
impl_->Resize(new_size, v);
|
||||
}
|
||||
|
||||
@ -427,5 +437,4 @@ template class HostDeviceVector<RTreeNodeStat>;
|
||||
*/
|
||||
template class HostDeviceVector<std::size_t>;
|
||||
#endif // defined(__APPLE__)
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@ -4,12 +4,14 @@
|
||||
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
|
||||
#define XGBOOST_COMMON_QUANTILE_CUH_
|
||||
|
||||
#include "xgboost/span.h"
|
||||
#include "xgboost/data.h"
|
||||
#include <thrust/logical.h> // for any_of
|
||||
|
||||
#include "categorical.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "quantile.h"
|
||||
#include "timer.h"
|
||||
#include "categorical.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@ -100,9 +102,9 @@ class SketchContainer {
|
||||
CHECK(device.IsCUDA());
|
||||
// Initialize Sketches for this dmatrix
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
this->columns_ptr_.Resize(num_columns + 1);
|
||||
this->columns_ptr_.Resize(num_columns + 1, 0);
|
||||
this->columns_ptr_b_.SetDevice(device_);
|
||||
this->columns_ptr_b_.Resize(num_columns + 1);
|
||||
this->columns_ptr_b_.Resize(num_columns + 1, 0);
|
||||
|
||||
this->feature_types_.Resize(feature_types.Size());
|
||||
this->feature_types_.Copy(feature_types);
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
/**
|
||||
* Copyright 2021-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/copy.h> // for copy
|
||||
#include <thrust/copy.h> // for copy
|
||||
#include <thrust/logical.h> // for any_of
|
||||
#include <thrust/scan.h>
|
||||
|
||||
#include <cassert>
|
||||
|
||||
@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
out["hist_train_param"] = ToJson(hist_maker_param_);
|
||||
}
|
||||
|
||||
~GPUHistMaker() { // NOLINT
|
||||
dh::GlobalMemoryLogger().Log();
|
||||
}
|
||||
~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); }
|
||||
|
||||
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
|
||||
common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
|
||||
21
tests/cpp/common/test_device_vector.cu
Normal file
21
tests/cpp/common/test_device_vector.cu
Normal file
@ -0,0 +1,21 @@
|
||||
/**
|
||||
* Copyright 2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../src/common/device_vector.cuh"
|
||||
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
|
||||
|
||||
namespace dh {
|
||||
TEST(DeviceUVector, Basic) {
|
||||
GlobalMemoryLogger().Clear();
|
||||
std::int32_t verbosity{3};
|
||||
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
|
||||
DeviceUVector<float> uvec;
|
||||
uvec.Resize(12);
|
||||
auto peak = GlobalMemoryLogger().PeakMemory();
|
||||
auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
|
||||
ASSERT_EQ(peak, n_bytes);
|
||||
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
|
||||
}
|
||||
} // namespace dh
|
||||
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2018-2023 XGBoost contributors
|
||||
* Copyright 2018-2024, XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/equal.h>
|
||||
@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) {
|
||||
ASSERT_FALSE(another.Empty());
|
||||
ASSERT_TRUE(vec.Empty());
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Resize) {
|
||||
auto check = [&](HostDeviceVector<float> const& vec) {
|
||||
auto const& h_vec = vec.ConstHostSpan();
|
||||
for (std::size_t i = 0; i < 4; ++i) {
|
||||
ASSERT_EQ(h_vec[i], i + 1);
|
||||
}
|
||||
for (std::size_t i = 4; i < vec.Size(); ++i) {
|
||||
ASSERT_EQ(h_vec[i], 3.0);
|
||||
}
|
||||
};
|
||||
{
|
||||
HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
|
||||
vec.SetDevice(DeviceOrd::CUDA(0));
|
||||
vec.ConstDeviceSpan();
|
||||
ASSERT_TRUE(vec.DeviceCanRead());
|
||||
ASSERT_FALSE(vec.DeviceCanWrite());
|
||||
vec.DeviceSpan();
|
||||
vec.Resize(7, 3.0f);
|
||||
ASSERT_TRUE(vec.DeviceCanWrite());
|
||||
check(vec);
|
||||
}
|
||||
{
|
||||
HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
|
||||
ASSERT_TRUE(vec.DeviceCanWrite());
|
||||
vec.Resize(7, 3.0f);
|
||||
ASSERT_TRUE(vec.DeviceCanWrite());
|
||||
check(vec);
|
||||
}
|
||||
{
|
||||
HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
|
||||
ASSERT_TRUE(vec.HostCanWrite());
|
||||
vec.Resize(7, 3.0f);
|
||||
ASSERT_TRUE(vec.HostCanWrite());
|
||||
check(vec);
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
// Copyright (c) 2019 by Contributors
|
||||
/**
|
||||
* Copyright 2019-2024, XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/execution_policy.h> // for device
|
||||
#include <thrust/sequence.h> // for sequence
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/json.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <memory>
|
||||
#include "../../../src/common/bitfield.h"
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
template <typename T>
|
||||
Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
|
||||
thrust::device_vector<T>* out_d_data) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user