diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu index a705751b1..bd428189f 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu +++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu @@ -132,7 +132,7 @@ class DataIteratorProxy { bool cache_on_host_{true}; // TODO(Bobby): Make this optional. template - using Alloc = xgboost::common::cuda::pinned_allocator; + using Alloc = xgboost::common::cuda_impl::pinned_allocator; template using HostVector = std::vector>; diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h index 6fe1757fd..c53ae4517 100644 --- a/src/common/cuda_pinned_allocator.h +++ b/src/common/cuda_pinned_allocator.h @@ -1,20 +1,19 @@ -/*! - * Copyright 2022 by XGBoost Contributors - * \file common.h - * \brief cuda pinned allocator for usage with thrust containers +/** + * Copyright 2022-2024, XGBoost Contributors + * + * @brief cuda pinned allocator for usage with thrust containers */ #pragma once -#include -#include +#include + +#include // for size_t +#include // for numeric_limits #include "common.h" -namespace xgboost { -namespace common { -namespace cuda { - +namespace xgboost::common::cuda_impl { // \p pinned_allocator is a CUDA-specific host memory allocator // that employs \c cudaMallocHost for allocation. // @@ -22,72 +21,94 @@ namespace cuda { // that Thrust used to provide. // // \see https://en.cppreference.com/w/cpp/memory/allocator -template -class pinned_allocator; - -template <> -class pinned_allocator { - public: - using value_type = void; // NOLINT: The type of the elements in the allocator - using pointer = void*; // NOLINT: The type returned by address() / allocate() - using const_pointer = const void*; // NOLINT: The type returned by address() - using size_type = std::size_t; // NOLINT: The type used for the size of the allocation - using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers - - template - struct rebind { // NOLINT - using other = pinned_allocator; // NOLINT: The rebound type - }; -}; - template -class pinned_allocator { - public: - using value_type = T; // NOLINT: The type of the elements in the allocator - using pointer = T*; // NOLINT: The type returned by address() / allocate() - using const_pointer = const T*; // NOLINT: The type returned by address() - using reference = T&; // NOLINT: The parameter type for address() - using const_reference = const T&; // NOLINT: The parameter type for address() - using size_type = std::size_t; // NOLINT: The type used for the size of the allocation - using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers +struct PinnedAllocPolicy { + using pointer = T*; // NOLINT: The type returned by address() / allocate() + using const_pointer = const T*; // NOLINT: The type returned by address() + using size_type = std::size_t; // NOLINT: The type used for the size of the allocation + using value_type = T; // NOLINT: The type of the elements in the allocator - template - struct rebind { // NOLINT - using other = pinned_allocator; // NOLINT: The rebound type - }; + size_type max_size() const { // NOLINT + return std::numeric_limits::max() / sizeof(value_type); + } - XGBOOST_DEVICE inline pinned_allocator() {}; // NOLINT: host/device markup ignored on defaulted functions - XGBOOST_DEVICE inline ~pinned_allocator() {} // NOLINT: host/device markup ignored on defaulted functions - XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT: host/device markup ignored on defaulted functions - - pinned_allocator& operator=(pinned_allocator const& that) = default; - pinned_allocator& operator=(pinned_allocator&& that) = default; - - template - XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT - - XGBOOST_DEVICE inline pointer address(reference r) { return &r; } // NOLINT - XGBOOST_DEVICE inline const_pointer address(const_reference r) { return &r; } // NOLINT - - inline pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if + pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT + if (cnt > this->max_size()) { + throw std::bad_alloc{}; + } // end if pointer result(nullptr); dh::safe_cuda(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); return result; } - inline void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT - - inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } // NOLINT - - XGBOOST_DEVICE inline bool operator==(pinned_allocator const& x) const { return true; } - - XGBOOST_DEVICE inline bool operator!=(pinned_allocator const& x) const { - return !operator==(x); - } + void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFreeHost(p)); } // NOLINT }; -} // namespace cuda -} // namespace common -} // namespace xgboost + +template +struct ManagedAllocPolicy { + using pointer = T*; // NOLINT: The type returned by address() / allocate() + using const_pointer = const T*; // NOLINT: The type returned by address() + using size_type = std::size_t; // NOLINT: The type used for the size of the allocation + using value_type = T; // NOLINT: The type of the elements in the allocator + + size_type max_size() const { // NOLINT + return std::numeric_limits::max() / sizeof(value_type); + } + + pointer allocate(size_type cnt, const_pointer = nullptr) { // NOLINT + if (cnt > this->max_size()) { + throw std::bad_alloc{}; + } // end if + + pointer result(nullptr); + dh::safe_cuda(cudaMallocManaged(reinterpret_cast(&result), cnt * sizeof(value_type))); + return result; + } + + void deallocate(pointer p, size_type) { dh::safe_cuda(cudaFree(p)); } // NOLINT +}; + +template typename Policy> +class CudaHostAllocatorImpl : public Policy { // NOLINT + public: + using value_type = typename Policy::value_type; // NOLINT + using pointer = typename Policy::pointer; // NOLINT + using const_pointer = typename Policy::const_pointer; // NOLINT + using size_type = typename Policy::size_type; // NOLINT + + using reference = T&; // NOLINT: The parameter type for address() + using const_reference = const T&; // NOLINT: The parameter type for address() + + using difference_type = std::ptrdiff_t; // NOLINT: The type of the distance between two pointers + + template + struct rebind { // NOLINT + using other = CudaHostAllocatorImpl; // NOLINT: The rebound type + }; + + CudaHostAllocatorImpl() = default; + ~CudaHostAllocatorImpl() = default; + CudaHostAllocatorImpl(CudaHostAllocatorImpl const&) = default; + + CudaHostAllocatorImpl& operator=(CudaHostAllocatorImpl const& that) = default; + CudaHostAllocatorImpl& operator=(CudaHostAllocatorImpl&& that) = default; + + template + CudaHostAllocatorImpl(CudaHostAllocatorImpl const&) {} // NOLINT + + pointer address(reference r) { return &r; } // NOLINT + const_pointer address(const_reference r) { return &r; } // NOLINT + + bool operator==(CudaHostAllocatorImpl const& x) const { return true; } + + bool operator!=(CudaHostAllocatorImpl const& x) const { return !operator==(x); } +}; + +template +using pinned_allocator = CudaHostAllocatorImpl; // NOLINT + +template +using managed_allocator = CudaHostAllocatorImpl; // NOLINT +} // namespace xgboost::common::cuda_impl diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 2927d028c..342ac8da7 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -20,7 +20,7 @@ namespace xgboost::data { struct EllpackHostCache { - thrust::host_vector> cache; + thrust::host_vector> cache; void Resize(std::size_t n, dh::CUDAStreamView stream) { stream.Sync(); // Prevent partial copy inside resize. diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 8c387f632..e82bcbf82 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -57,7 +57,7 @@ struct CatAccessor { class GPUHistEvaluator { using CatST = common::CatBitField::value_type; // categorical storage type // use pinned memory to stage the categories, used for sort based splits. - using Alloc = xgboost::common::cuda::pinned_allocator; + using Alloc = xgboost::common::cuda_impl::pinned_allocator; private: TreeEvaluator tree_evaluator_; diff --git a/tests/cpp/common/test_cuda_host_allocator.cu b/tests/cpp/common/test_cuda_host_allocator.cu new file mode 100644 index 000000000..c8e25564a --- /dev/null +++ b/tests/cpp/common/test_cuda_host_allocator.cu @@ -0,0 +1,36 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include +#include // for Context + +#include + +#include "../../../src/common/cuda_pinned_allocator.h" +#include "../../../src/common/device_helpers.cuh" // for DefaultStream +#include "../../../src/common/numeric.h" // for Iota + +namespace xgboost { +TEST(CudaHostMalloc, Pinned) { + std::vector> vec; + vec.resize(10); + ASSERT_EQ(vec.size(), 10); + Context ctx; + common::Iota(&ctx, vec.begin(), vec.end(), 0); + float k = 0; + for (auto v : vec) { + ASSERT_EQ(v, k); + ++k; + } +} + +TEST(CudaHostMalloc, Managed) { + std::vector> vec; + vec.resize(10); +#if defined(__linux__) + dh::safe_cuda( + cudaMemPrefetchAsync(vec.data(), vec.size() * sizeof(float), 0, dh::DefaultStream())); +#endif + dh::DefaultStream().Sync(); +} +} // namespace xgboost