diff --git a/src/common/cuda_dr_utils.cc b/src/common/cuda_dr_utils.cc
new file mode 100644
index 000000000..13f2516d4
--- /dev/null
+++ b/src/common/cuda_dr_utils.cc
@@ -0,0 +1,108 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ */
+#if defined(XGBOOST_USE_CUDA)
+#include "cuda_dr_utils.h"
+
+#include <algorithm>  // for max
+#include <cstdint>    // for int32_t
+#include <cstring>    // for memset
+#include <memory>     // for make_unique
+#include <mutex>      // for call_once
+#include <sstream>    // for stringstream
+#include <string>     // for string
+
+#include "common.h"               // for safe_cuda
+#include "cuda_rt_utils.h"        // for CurrentDevice
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::cudr {
+CuDriverApi::CuDriverApi() {
+  // Similar to dlopen, but without the need to release a handle.
+  auto safe_load = [](xgboost::StringView name, auto **fnptr) {
+    cudaDriverEntryPointQueryResult status;
+    dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast<void **>(fnptr),
+                                          cudaEnablePerThreadDefaultStream, &status));
+    CHECK(status == cudaDriverEntryPointSuccess) << name;
+    CHECK(*fnptr);
+  };
+
+  safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity);
+  safe_load("cuMemCreate", &this->cuMemCreate);
+  safe_load("cuMemMap", &this->cuMemMap);
+  safe_load("cuMemAddressReserve", &this->cuMemAddressReserve);
+  safe_load("cuMemSetAccess", &this->cuMemSetAccess);
+  safe_load("cuMemUnmap", &this->cuMemUnmap);
+  safe_load("cuMemRelease", &this->cuMemRelease);
+  safe_load("cuMemAddressFree", &this->cuMemAddressFree);
+  safe_load("cuGetErrorString", &this->cuGetErrorString);
+  safe_load("cuGetErrorName", &this->cuGetErrorName);
+  safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
+  safe_load("cuDeviceGet", &this->cuDeviceGet);
+
+  CHECK(this->cuMemGetAllocationGranularity);
+}
+
+void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line,
+                               char const *file) const {
+  if (status == CUDA_SUCCESS) {
+    return;
+  }
+  std::string cuerr{"CUDA driver error:"};
+
+  char const *name{nullptr};
+  auto err0 = this->cuGetErrorName(status, &name);
+  if (err0 != CUDA_SUCCESS) {
+    LOG(WARNING) << cuerr << status << ". Then we failed to get the error name:" << err0;
+  }
+  char const *msg{nullptr};
+  auto err1 = this->cuGetErrorString(status, &msg);
+  if (err1 != CUDA_SUCCESS) {
+    LOG(WARNING) << cuerr << status << ". Then we failed to get the error string:" << err1;
+  }
+
+  std::stringstream ss;
+  ss << fn << "[" << file << ":" << line << "]:";
+  if (name != nullptr && err0 == CUDA_SUCCESS) {
+    ss << cuerr << " " << name << ".";
+  }
+  if (msg != nullptr && err1 == CUDA_SUCCESS) {
+    ss << " " << msg << "\n";
+  }
+  LOG(FATAL) << ss.str();
+}
+
+[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() {
+  static std::once_flag flag;
+  static std::unique_ptr<CuDriverApi> cu;
+  std::call_once(flag, [&] { cu = std::make_unique<CuDriverApi>(); });
+  return *cu;
+}
+
+void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
+  auto ordinal = curt::CurrentDevice();
+  loc->type = type;
+
+  if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
+    loc->id = ordinal;
+  } else {
+    std::int32_t numa_id = -1;
+    CUdevice device;
+    safe_cu(GetGlobalCuDriverApi().cuDeviceGet(&device, ordinal));
+    safe_cu(GetGlobalCuDriverApi().cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID,
+                                                        device));
+    numa_id = std::max(numa_id, 0);
+
+    loc->id = numa_id;
+  }
+}
+
+[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
+  CUmemAllocationProp prop;
+  std::memset(&prop, '\0', sizeof(prop));
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  MakeCuMemLocation(type, &prop.location);
+  return prop;
+}
+}  // namespace xgboost::cudr
+#endif
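Reviewer note, not part of the diff: a minimal standalone sketch of the entry-point loading technique `CuDriverApi` relies on — resolving a driver symbol through the runtime so `libcuda.so` never has to be linked at build time. It assumes CUDA 12+, where `cudaGetDriverEntryPoint` takes the four-argument form with a query-result out-parameter, and uses only that documented API:

```cpp
// Sketch only: load one driver entry point via the runtime and call it.
#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cstdio>

int main() {
  // Force runtime/driver initialization before making driver calls.
  if (cudaSetDevice(0) != cudaSuccess) {
    return 1;
  }
  // Function-pointer type matching the driver's cuMemGetAllocationGranularity.
  CUresult (*get_granularity)(size_t *, const CUmemAllocationProp *,
                              CUmemAllocationGranularity_flags) = nullptr;
  cudaDriverEntryPointQueryResult status;
  if (cudaGetDriverEntryPoint("cuMemGetAllocationGranularity",
                              reinterpret_cast<void **>(&get_granularity),
                              cudaEnablePerThreadDefaultStream, &status) != cudaSuccess ||
      status != cudaDriverEntryPointSuccess || get_granularity == nullptr) {
    std::fprintf(stderr, "symbol not available\n");
    return 1;
  }
  CUmemAllocationProp prop{};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = 0;
  size_t granularity = 0;
  get_granularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED);
  std::printf("granularity: %zu bytes\n", granularity);
  return 0;
}
```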
diff --git a/src/common/cuda_dr_utils.h b/src/common/cuda_dr_utils.h
new file mode 100644
index 000000000..ae0c9cef1
--- /dev/null
+++ b/src/common/cuda_dr_utils.h
@@ -0,0 +1,105 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ *
+ * @brief Utility for the CUDA driver API.
+ *
+ * XGBoost doesn't link libcuda.so at build time. The utilities here load the shared
+ * object at runtime.
+ */
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include <cstdint>  // for int32_t
+
+#include "xgboost/string_view.h"  // for StringView
+
+namespace xgboost::cudr {
+/**
+ * @brief A struct for retrieving the CUDA driver API from the runtime API.
+ */
+struct CuDriverApi {
+  using Flags = unsigned long long;  // NOLINT
+
+  // Memory manipulation functions.
+  using MemGetAllocationGranularityFn = CUresult(size_t *granularity,
+                                                 const CUmemAllocationProp *prop,
+                                                 CUmemAllocationGranularity_flags option);
+  using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size,
+                               const CUmemAllocationProp *prop, Flags flags);
+  using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset,
+                            CUmemGenericAllocationHandle handle, Flags flags);
+  using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment,
+                                       CUdeviceptr addr, Flags flags);
+  using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc,
+                                  size_t count);
+  using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size);
+  using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle);
+  using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size);
+  // Error handling
+  using GetErrorString = CUresult(CUresult error, const char **pStr);
+  using GetErrorName = CUresult(CUresult error, const char **pStr);
+  // Device attributes
+  using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
+  using DeviceGet = CUresult(CUdevice *device, int ordinal);
+
+  MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr};  // NOLINT
+  MemCreateFn *cuMemCreate{nullptr};                                      // NOLINT
+  /**
+   * @param[in] offset - Must be zero.
+   */
+  MemMapFn *cuMemMap{nullptr};  // NOLINT
+  /**
+   * @param[out] ptr       - Resulting pointer to the start of the allocated virtual address range.
+   * @param[in]  size      - Size of the reserved virtual address range requested.
+   * @param[in]  alignment - Alignment of the reserved virtual address range requested.
+   * @param[in]  addr      - Fixed starting address range requested.
+   * @param[in]  flags     - Currently unused, must be zero.
+   */
+  MemAddressReserveFn *cuMemAddressReserve{nullptr};  // NOLINT
+  MemSetAccessFn *cuMemSetAccess{nullptr};            // NOLINT
+  MemUnmapFn *cuMemUnmap{nullptr};                    // NOLINT
+  MemReleaseFn *cuMemRelease{nullptr};                // NOLINT
+  MemAddressFreeFn *cuMemAddressFree{nullptr};        // NOLINT
+  GetErrorString *cuGetErrorString{nullptr};          // NOLINT
+  GetErrorName *cuGetErrorName{nullptr};              // NOLINT
+  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};  // NOLINT
+  DeviceGet *cuDeviceGet{nullptr};                    // NOLINT
+
+  CuDriverApi();
+
+  void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const;
+};
+
+[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi();
+
+/**
+ * @brief Macro for guarding CUDA driver API calls.
+ */
+#define safe_cu(call)                                                                            \
+  do {                                                                                           \
+    auto __status = (call);                                                                      \
+    if (__status != CUDA_SUCCESS) {                                                              \
+      ::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(__status, #call, __LINE__, __FILE__); \
+    }                                                                                            \
+  } while (0)
+
+// Get the allocation granularity.
+inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
+  std::size_t granularity;
+  safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
+      &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+  return granularity;
+}
+
+/**
+ * @brief Obtain the appropriate device ordinal for a `CUmemLocation`.
+ */
+void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);
+
+/**
+ * @brief Construct a `CUmemAllocationProp`.
+ */
+[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
+}  // namespace xgboost::cudr
diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc
index d41981d8f..53a4105dc 100644
--- a/src/common/cuda_rt_utils.cc
+++ b/src/common/cuda_rt_utils.cc
@@ -8,10 +8,11 @@
 #endif  // defined(XGBOOST_USE_CUDA)
 
 #include <cstdint>  // for int32_t
+#include <mutex>    // for once_flag, call_once
 
 #include "common.h"  // for safe_cuda
 
-namespace xgboost::common {
+namespace xgboost::curt {
 #if defined(XGBOOST_USE_CUDA)
 std::int32_t AllVisibleGPUs() {
   int n_visgpus = 0;
@@ -19,7 +20,7 @@ std::int32_t AllVisibleGPUs() {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch (const dmlc::Error &) {
+  } catch (const dmlc::Error&) {
    cudaGetLastError();  // reset error.
    return 0;
  }
@@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) {
     dh::safe_cuda(cudaSetDevice(device));
   }
 }
+
+namespace {
+template <typename Fn>
+void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
+  static std::int32_t version = 0;
+  static std::once_flag flag;
+  std::call_once(flag, [&] { fn(&version); });
+  if (major) {
+    *major = version / 1000;
+  }
+  if (minor) {
+    *minor = version % 100 / 10;
+  }
+}
+}  // namespace
+
+void RtVersion(std::int32_t* major, std::int32_t* minor) {
+  GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaRuntimeGetVersion(ver)); }, major,
+                 minor);
+}
+
+void DrVersion(std::int32_t* major, std::int32_t* minor) {
+  GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaDriverGetVersion(ver)); }, major, minor);
+}
+
 #else
 std::int32_t AllVisibleGPUs() { return 0; }
 
 std::int32_t CurrentDevice() {
-  AssertGPUSupport();
+  common::AssertGPUSupport();
   return -1;
 }
@@ -79,8 +105,8 @@ void CheckComputeCapability() {}
 
 void SetDevice(std::int32_t device) {
   if (device >= 0) {
-    AssertGPUSupport();
+    common::AssertGPUSupport();
   }
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace xgboost::common
+}  // namespace xgboost::curt
diff --git a/src/common/cuda_rt_utils.h b/src/common/cuda_rt_utils.h
index 210f1e07d..0fac7e35e 100644
--- a/src/common/cuda_rt_utils.h
+++ b/src/common/cuda_rt_utils.h
@@ -8,7 +8,7 @@
 #include <nvtx3/nvtx3.hpp>
 #endif  // defined(XGBOOST_USE_NVTX)
 
-namespace xgboost::common {
+namespace xgboost::curt {
 std::int32_t AllVisibleGPUs();
 
 std::int32_t CurrentDevice();
@@ -24,6 +24,12 @@ void CheckComputeCapability();
 
 void SetDevice(std::int32_t device);
 
+// Returns the CUDA runtime version.
+void RtVersion(std::int32_t* major, std::int32_t* minor);
+
+// Returns the latest version of CUDA supported by the driver.
+void DrVersion(std::int32_t* major, std::int32_t* minor);
+
 struct NvtxDomain {
   static constexpr char const *name{"libxgboost"};  // NOLINT
 };
@@ -49,10 +55,10 @@ class NvtxRgb {
   explicit NvtxRgb(Args &&...) {}
 };
 #endif  // defined(XGBOOST_USE_NVTX)
-}  // namespace xgboost::common
+}  // namespace xgboost::curt
 
 #if defined(XGBOOST_USE_NVTX)
-#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
+#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain)
 #else
 #define xgboost_NVTX_FN_RANGE()
 #endif  // defined(XGBOOST_USE_NVTX)
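Not part of the PR: a worked check of the decoding in `GetVersionImpl`. CUDA encodes versions as `major * 1000 + minor * 10`, so for example `cudaDriverGetVersion` reports 12.5 as 12050:

```cpp
// Sketch verifying the version decoding arithmetic used above.
#include <cassert>
#include <cstdint>

int main() {
  std::int32_t version = 12050;             // as returned for CUDA 12.5
  std::int32_t major = version / 1000;      // 12050 / 1000 == 12
  std::int32_t minor = version % 100 / 10;  // 12050 % 100 == 50, 50 / 10 == 5
  assert(major == 12 && minor == 5);
  return 0;
}
```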
diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu
new file mode 100644
index 000000000..608a535cd
--- /dev/null
+++ b/src/common/device_helpers.cu
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ */
+#include "cuda_rt_utils.h"    // for DrVersion
+#include "device_helpers.cuh"
+#include "xgboost/windefs.h"  // for xgboost_IS_WIN
+
+namespace dh {
+PinnedMemory::PinnedMemory() {
+#if defined(xgboost_IS_WIN)
+  this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
+#else
+  std::int32_t major{0}, minor{0};
+  xgboost::curt::DrVersion(&major, &minor);
+  // Host NUMA allocation requires a driver that supports CTK >= 12.5 to be stable.
+  if (major > 12 || (major == 12 && minor >= 5)) {
+    this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
+  } else {
+    this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
+  }
+#endif
+}
+}  // namespace dh
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index d3515b5b1..4d1115bc7 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -16,7 +16,8 @@
 #include <cstddef>            // for size_t
 #include <cub/cub.cuh>
 #include <cub/util_type.cuh>  // for UnitWord, DoubleBuffer
-#include <vector>
+#include <variant>  // for variant, visit
+#include <vector>   // for vector
 
 #include "common.h"
 #include "device_vector.cuh"
@@ -372,36 +373,25 @@ void CopyDeviceSpanToVector(std::vector *dst, xgboost::common::Span
 }
 
 // Keep track of pinned memory allocation
-struct PinnedMemory {
-  void *temp_storage{nullptr};
-  size_t temp_storage_bytes{0};
+class PinnedMemory {
+  std::variant<detail::GrowOnlyPinnedMemoryImpl, detail::GrowOnlyVirtualMemVec> impl_;
 
-  ~PinnedMemory() { Free(); }
+ public:
+  PinnedMemory();
 
   template <typename T>
   xgboost::common::Span<T> GetSpan(size_t size) {
-    size_t num_bytes = size * sizeof(T);
-    if (num_bytes > temp_storage_bytes) {
-      Free();
-      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
-      temp_storage_bytes = num_bytes;
-    }
-    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
+    return std::visit([&](auto &&alloc) { return alloc.template GetSpan<T>(size); }, this->impl_);
   }
 
   template <typename T>
-  xgboost::common::Span<T> GetSpan(size_t size, T init) {
+  xgboost::common::Span<T> GetSpan(size_t size, T const &init) {
     auto result = this->GetSpan<T>(size);
-    for (auto &e : result) {
-      e = init;
-    }
+    std::fill_n(result.data(), result.size(), init);
     return result;
   }
-
-  void Free() {
-    if (temp_storage != nullptr) {
-      safe_cuda(cudaFreeHost(temp_storage));
-    }
+  // Used for testing.
+  [[nodiscard]] bool IsVm() {
+    return std::get_if<detail::GrowOnlyVirtualMemVec>(&this->impl_) != nullptr;
   }
 };
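Reviewer note, not part of the diff: `PinnedMemory` above dispatches between two interchangeable backends through `std::variant`/`std::visit`. A standalone sketch of that dispatch pattern with toy types (none of these names exist in the PR):

```cpp
// Toy illustration of the variant-backed dispatch used by PinnedMemory.
#include <cstdio>
#include <variant>

struct LegacyBackend {  // stands in for GrowOnlyPinnedMemoryImpl
  int GetValue() { return 1; }
};
struct VmBackend {  // stands in for GrowOnlyVirtualMemVec
  int GetValue() { return 2; }
};

class Wrapper {
  std::variant<LegacyBackend, VmBackend> impl_;

 public:
  explicit Wrapper(bool use_vm) {
    // Pick the backend at runtime, e.g. based on the driver version.
    if (use_vm) {
      impl_.emplace<VmBackend>();
    } else {
      impl_.emplace<LegacyBackend>();
    }
  }
  int GetValue() {
    // std::visit forwards the call to whichever backend is active.
    return std::visit([](auto &&backend) { return backend.GetValue(); }, impl_);
  }
  [[nodiscard]] bool IsVm() const { return std::get_if<VmBackend>(&impl_) != nullptr; }
};

int main() {
  Wrapper w{true};
  std::printf("%d %d\n", w.GetValue(), static_cast<int>(w.IsVm()));
  return 0;
}
```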
diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu
index 0cfa947ba..b7f300df6 100644
--- a/src/common/device_vector.cu
+++ b/src/common/device_vector.cu
@@ -1,10 +1,14 @@
 /**
  * Copyright 2017-2024, XGBoost contributors
  */
+#include <numeric>  // for accumulate
+
 #include "../collective/communicator-inl.h"  // for GetRank
 #include "common.h"                          // for HumanMemUnit
-#include "device_helpers.cuh"                // for CurrentDevice
+#include "cuda_dr_utils.h"
+#include "device_helpers.cuh"    // for CurrentDevice
 #include "device_vector.cuh"
+#include "transform_iterator.h"  // for MakeIndexTransformIter
 
 namespace dh {
 namespace detail {
@@ -18,6 +22,79 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
      << "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
   LOG(FATAL) << ss.str();
 }
+
+[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::PhyCapacity() const {
+  auto it = xgboost::common::MakeIndexTransformIter(
+      [&](std::size_t i) { return this->handles_[i]->size; });
+  return std::accumulate(it, it + this->handles_.size(), static_cast<std::size_t>(0));
+}
+
+void GrowOnlyVirtualMemVec::Reserve(std::size_t new_size) {
+  auto va_capacity = this->Capacity();
+  if (new_size < va_capacity) {
+    return;
+  }
+
+  // Try to reserve a new virtual address range.
+  auto const aligned_size = RoundUp(new_size, this->granularity_);
+  auto const new_reserve_size = aligned_size - va_capacity;
+  CUresult status = CUDA_SUCCESS;
+  auto hint = this->DevPtr() + va_capacity;
+
+  bool failed{false};
+  auto range = std::make_unique<VaRange>(new_reserve_size, hint, &status, &failed);
+  if (failed) {
+    // Failed to reserve the requested address.
+    // Slow path, try to reserve a new address range with the full size.
+    range = std::make_unique<VaRange>(aligned_size, 0ULL, &status, &failed);
+    safe_cu(status);
+    CHECK(!failed);
+
+    // The new allocation is successful. Map the physical addresses to the new virtual
+    // address range. First unmap the existing ptr.
+    if (this->DevPtr() != 0) {
+      // Unmap the existing ptr.
+      safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
+
+      // Then remap all the existing physical addresses to the new ptr.
+      CUdeviceptr ptr = range->DevPtr();
+      for (auto const &hdl : this->handles_) {
+        this->MapBlock(ptr, hdl);
+        ptr += hdl->size;
+      }
+
+      // Release the existing ptr.
+      va_ranges_.clear();
+    }
+  }
+
+  va_ranges_.emplace_back(std::move(range));
+}
+
+GrowOnlyVirtualMemVec::GrowOnlyVirtualMemVec(CUmemLocationType type)
+    : prop_{xgboost::cudr::MakeAllocProp(type)},
+      granularity_{xgboost::cudr::GetAllocGranularity(&this->prop_)} {
+  CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA);
+  // Assign the access descriptor.
+  CUmemAccessDesc dacc;
+  dacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &dacc.location);
+  this->access_desc_.push_back(dacc);
+
+  if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
+    CUmemAccessDesc hacc;
+    hacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    xgboost::cudr::MakeCuMemLocation(type, &hacc.location);
+    this->access_desc_.push_back(hacc);
+  }
+}
+
+[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::Capacity() const {
+  auto it = xgboost::common::MakeIndexTransformIter(
+      [&](std::size_t i) { return this->va_ranges_[i]->Size(); });
+  return std::accumulate(it, it + this->va_ranges_.size(), static_cast<std::size_t>(0));
+}
 }  // namespace detail
 
 #if defined(XGBOOST_USE_RMM)
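Not part of the PR: a standalone sketch of the low-level VMM sequence that `Reserve`/`MapBlock` above build on (reserve a virtual range, create physical memory, map it, grant access — see the NVIDIA blog linked in the header). Unlike the PR, this links `libcuda` directly and manages its own context; error handling is reduced to aborting:

```cpp
// Minimal reserve -> create -> map -> set-access round trip with the driver API.
#include <cuda.h>

#include <cstdio>
#include <cstdlib>

#define CU_CALL(call)                  \
  do {                                 \
    if ((call) != CUDA_SUCCESS) {      \
      std::abort();                    \
    }                                  \
  } while (0)

int main() {
  CU_CALL(cuInit(0));
  CUdevice dev;
  CU_CALL(cuDeviceGet(&dev, 0));
  CUcontext ctx;
  CU_CALL(cuCtxCreate(&ctx, 0, dev));

  CUmemAllocationProp prop{};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = 0;
  size_t gran;
  CU_CALL(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));

  // 1. Reserve a virtual address range; no memory is committed yet, which is why
  //    over-reserving (as the grow-only vector does) is cheap.
  CUdeviceptr ptr;
  CU_CALL(cuMemAddressReserve(&ptr, 2 * gran, 0, 0, 0));
  // 2. Create one block of physical memory and map it into the first half.
  CUmemGenericAllocationHandle handle;
  CU_CALL(cuMemCreate(&handle, gran, &prop, 0));
  CU_CALL(cuMemMap(ptr, gran, 0, handle, 0));
  // 3. Grant read/write access before touching the mapping.
  CUmemAccessDesc acc{};
  acc.location = prop.location;
  acc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CU_CALL(cuMemSetAccess(ptr, gran, &acc, 1));

  std::printf("mapped %zu bytes at %llx\n", gran, static_cast<unsigned long long>(ptr));

  CU_CALL(cuMemUnmap(ptr, gran));
  CU_CALL(cuMemRelease(handle));
  CU_CALL(cuMemAddressFree(ptr, 2 * gran));
  CU_CALL(cuCtxDestroy(ctx));
  return 0;
}
```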
diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
index 004f0881d..6daa4f565 100644
--- a/src/common/device_vector.cuh
+++ b/src/common/device_vector.cuh
@@ -25,6 +25,8 @@
 
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 
+#include <cuda.h>
+
 #include <cstddef>                 // for size_t
 #include <cub/util_allocator.cuh>  // for CachingDeviceAllocator
 #include <cuda_runtime_api.h>      // for CurrentDevice
@@ -32,8 +34,10 @@
 #include <memory>                  // for unique_ptr
 #include <mutex>                   // for defer_lock
 
-#include "common.h"  // for safe_cuda, HumanMemUnit
+#include "common.h"         // for safe_cuda, HumanMemUnit
+#include "cuda_dr_utils.h"  // for CuDriverApi
 #include "xgboost/logging.h"
+#include "xgboost/span.h"   // for Span
 
 namespace dh {
 namespace detail {
@@ -127,6 +131,153 @@ class MemoryLogger {
 };
 
 void ThrowOOMError(std::string const &err, std::size_t bytes);
+
+struct GrowOnlyPinnedMemoryImpl {
+  void *temp_storage{nullptr};
+  size_t temp_storage_bytes{0};
+
+  ~GrowOnlyPinnedMemoryImpl() { Free(); }
+
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(size_t size) {
+    size_t num_bytes = size * sizeof(T);
+    if (num_bytes > temp_storage_bytes) {
+      Free();
+      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
+      temp_storage_bytes = num_bytes;
+    }
+    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
+  }
+
+  void Free() {
+    if (temp_storage != nullptr) {
+      safe_cuda(cudaFreeHost(temp_storage));
+    }
+  }
+};
+
+/**
+ * @brief Use low-level virtual memory functions from the CUDA driver API for grow-only
+ *        memory allocation.
+ *
+ * @url https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/
+ *
+ * Aside from the potential performance benefits, this is primarily implemented to prevent
+ * deadlock between NCCL and XGBoost. The host NUMA version requires CTK 12.5+ to be stable.
+ */
+class GrowOnlyVirtualMemVec {
+  static auto RoundUp(std::size_t new_sz, std::size_t chunk_sz) {
+    return ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;
+  }
+
+  struct PhyAddrHandle {
+    CUmemGenericAllocationHandle handle;
+    std::size_t size;
+  };
+
+  class VaRange {
+    CUdeviceptr ptr_{0};
+    std::size_t size_{0};
+
+   public:
+    VaRange(std::size_t size, CUdeviceptr hint, CUresult *p_status, bool *failed) : size_{size} {
+      CUresult &status = *p_status;
+      status = xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressReserve(&ptr_, size, 0, hint, 0);
+      *failed = status != CUDA_SUCCESS || (hint != 0 && ptr_ != hint);
+    }
+    ~VaRange() {
+      if (ptr_ != 0) {
+        xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressFree(ptr_, this->size_);
+      }
+    }
+
+    VaRange(VaRange const &that) = delete;
+    VaRange &operator=(VaRange const &that) = delete;
+
+    VaRange(VaRange &&that) noexcept {
+      std::swap(this->ptr_, that.ptr_);
+      std::swap(this->size_, that.size_);
+    }
+    VaRange &operator=(VaRange &&that) noexcept {
+      std::swap(this->ptr_, that.ptr_);
+      std::swap(this->size_, that.size_);
+      return *this;
+    }
+    [[nodiscard]] auto DevPtr() const { return this->ptr_; }
+    [[nodiscard]] std::size_t Size() const { return this->size_; }
+  };
+
+  using PhyHandle = std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>;
+  std::vector<PhyHandle> handles_;
+  std::vector<std::unique_ptr<VaRange>> va_ranges_;
+
+  xgboost::cudr::CuDriverApi &cu_{xgboost::cudr::GetGlobalCuDriverApi()};
+  std::vector<CUmemAccessDesc> access_desc_;
+  CUmemAllocationProp const prop_;
+
+  // Always in bytes.
+  std::size_t const granularity_;
+
+  [[nodiscard]] std::size_t PhyCapacity() const;
+  [[nodiscard]] CUdeviceptr DevPtr() const {
+    if (this->va_ranges_.empty()) {
+      return 0;
+    }
+    return this->va_ranges_.front()->DevPtr();
+  }
+  void MapBlock(CUdeviceptr ptr, PhyHandle const &hdl) const {
+    safe_cu(cu_.cuMemMap(ptr, hdl->size, 0, hdl->handle, 0));
+    safe_cu(cu_.cuMemSetAccess(ptr, hdl->size, access_desc_.data(), access_desc_.size()));
+  }
+  auto CreatePhysicalMem(std::size_t size) const {
+    CUmemGenericAllocationHandle alloc_handle;
+    auto padded_size = RoundUp(size, this->granularity_);
+    CUresult status = this->cu_.cuMemCreate(&alloc_handle, padded_size, &this->prop_, 0);
+    CHECK_EQ(status, CUDA_SUCCESS);
+    return alloc_handle;
+  }
+  void Reserve(std::size_t new_size);
+
+ public:
+  explicit GrowOnlyVirtualMemVec(CUmemLocationType type);
+
+  void GrowTo(std::size_t n_bytes) {
+    auto alloc_size = this->PhyCapacity();
+    if (n_bytes <= alloc_size) {
+      return;
+    }
+
+    std::size_t delta = n_bytes - alloc_size;
+    auto const padded_delta = RoundUp(delta, this->granularity_);
+    this->Reserve(alloc_size + padded_delta);
+
+    this->handles_.emplace_back(
+        std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>{
+            new PhyAddrHandle{this->CreatePhysicalMem(padded_delta), padded_delta}, [&](auto *hdl) {
+              if (hdl) {
+                cu_.cuMemRelease(hdl->handle);
+              }
+            }});
+    auto ptr = this->DevPtr() + alloc_size;
+    this->MapBlock(ptr, this->handles_.back());
+  }
+
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(std::size_t size) {
+    size_t n_bytes = size * sizeof(T);
+    this->GrowTo(n_bytes);
+    return xgboost::common::Span<T>(reinterpret_cast<T *>(this->DevPtr()), size);
+  }
+
+  ~GrowOnlyVirtualMemVec() noexcept(false) {
+    if (this->DevPtr() != 0) {
+      safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
+    }
+
+    this->va_ranges_.clear();  // make sure all VA are freed before releasing the handles.
+    this->handles_.clear();    // release the handles.
+  }
+
+  [[nodiscard]] void *data() { return reinterpret_cast<void *>(this->DevPtr()); }  // NOLINT
+  [[nodiscard]] std::size_t size() const { return this->PhyCapacity(); }           // NOLINT
+  [[nodiscard]] std::size_t Capacity() const;
+};
 }  // namespace detail
 
 inline detail::MemoryLogger &GlobalMemoryLogger() {
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index 475068053..66463ef2f 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -337,7 +337,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
                                   int num_cuts_per_feature, bool is_ranking, float missing,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer* sketch_container) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   info.weights_.SetDevice(ctx->Device());
   auto weights = info.weights_.ConstDeviceSpan();
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index f2c7e4461..a638ed6b5 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span const &d_x,
 void SketchContainer::Push(Context const *ctx, Span<Entry const> entries,
                            Span<size_t> columns_ptr, common::Span<OffsetT> cuts_ptr,
                            size_t total_cuts, Span<float> weights) {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   Span<SketchEntry> out;
   dh::device_vector<SketchEntry> cuts;
   bool first_window = this->Current().empty();
@@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<Entry> entries,
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
 
   auto key_it = dh::MakeTransformIterator<size_t>(
@@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<Entry> entries,
 
 void SketchContainer::Prune(Context const* ctx, std::size_t to) {
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -443,7 +443,7 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) {
 
 void SketchContainer::Merge(Context const *ctx, Span<OffsetT const> d_that_columns_ptr,
                             Span<SketchEntry const> that) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   auto self = dh::ToSpan(this->Current());
   LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". "
             << "That:" << HumanMemUnit(that.size_bytes()) << ". "
@@ -507,7 +507,7 @@ void SketchContainer::FixError() {
 }
 
 void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
   }
@@ -596,7 +596,7 @@ struct InvalidCatOp {
 
 void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   p_cuts->min_vals_.Resize(num_columns_);
 
   // Sync between workers.
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 4d849540a..1b60670d0 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -206,7 +206,7 @@ class SketchContainer {
   template <typename KeyComp = thrust::equal_to<size_t>>
   size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to<size_t>{}) {
     timer_.Start(__func__);
-    SetDevice(ctx->Ordinal());
+    curt::SetDevice(ctx->Ordinal());
     this->columns_ptr_.SetDevice(ctx->Device());
     Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
     CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
diff --git a/src/common/threading_utils.cc b/src/common/threading_utils.cc
index f7296b7f9..0d943f94f 100644
--- a/src/common/threading_utils.cc
+++ b/src/common/threading_utils.cc
@@ -9,10 +9,12 @@
 #include <fstream>  // for ifstream
 #include <string>   // for string
 
-#include "common.h"  // for DivRoundUp
+#include "common.h"       // for DivRoundUp
 
 #if defined(__linux__)
 #include <pthread.h>
+#include <sys/syscall.h>  // for SYS_getcpu
+#include <unistd.h>       // for syscall
 #endif
 
 namespace xgboost::common {
@@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
   return n_threads;
 }
 
+[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) {
+#ifdef SYS_getcpu
+  return syscall(SYS_getcpu, cpu, numa, NULL) == 0;
+#else
+  return false;
+#endif
+}
+
 void NameThread(std::thread* t, StringView name) {
 #if defined(__linux__)
   auto handle = t->native_handle();
diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h
index e21400705..a4e2f21e4 100644
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -306,10 +306,16 @@ class MemStackAllocator {
 };
 
 /**
- * \brief Constant that can be used for initializing static thread local memory.
+ * @brief Constant that can be used for initializing static thread local memory.
  */
 std::int32_t constexpr DefaultMaxThreads() { return 128; }
 
+/**
+ * @brief Get the NUMA node on Linux. Other platforms are not supported. Returns false if
+ *        the call fails.
+ */
+[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa);
+
 /**
  * @brief Give the thread a name. Supports only pthread on linux.
  */
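Not part of the PR: a standalone sketch of the `getcpu(2)` syscall technique behind `GetCpuNuma` (Linux only, mirroring the guard used above):

```cpp
// Query which CPU and NUMA node the calling thread is currently running on.
#include <cstdio>

#include <sys/syscall.h>  // for SYS_getcpu
#include <unistd.h>       // for syscall

int main() {
#ifdef SYS_getcpu
  unsigned int cpu = 0, numa = 0;
  // The third argument (tcache) is unused since Linux 2.6.24 and may be null.
  if (syscall(SYS_getcpu, &cpu, &numa, nullptr) == 0) {
    std::printf("running on CPU %u, NUMA node %u\n", cpu, numa);
    return 0;
  }
#endif
  std::fprintf(stderr, "SYS_getcpu not available\n");
  return 1;
}
```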
diff --git a/src/common/timer.cc b/src/common/timer.cc
index 0b55d1623..a105f7a4a 100644
--- a/src/common/timer.cc
+++ b/src/common/timer.cc
@@ -18,7 +18,7 @@ void Monitor::Start(std::string const &name) {
     auto &stats = statistics_map_[name];
     stats.timer.Start();
 #if defined(XGBOOST_USE_NVTX)
-    auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
+    auto range_handle = nvtx3::start_range_in<curt::NvtxDomain>(label_ + "::" + name);
     stats.nvtx_id = range_handle.get_value();
 #endif  // defined(XGBOOST_USE_NVTX)
 }
@@ -30,7 +30,7 @@ void Monitor::Stop(const std::string &name) {
     stats.timer.Stop();
     stats.count++;
 #if defined(XGBOOST_USE_NVTX)
-    nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
+    nvtx3::end_range_in<curt::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
 #endif  // defined(XGBOOST_USE_NVTX)
   }
 }
diff --git a/src/context.cc b/src/context.cc
index 19060d5fc..5be8fcb0d 100644
--- a/src/context.cc
+++ b/src/context.cc
@@ -38,7 +38,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
 [[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
   // When booster is loaded from a memory image (Python pickle or R raw model), number of
   // available GPUs could be different. Wrap around it.
-  std::int32_t n_visible = common::AllVisibleGPUs();
+  std::int32_t n_visible = curt::AllVisibleGPUs();
   if (n_visible == 0) {
     if (device.IsCUDA()) {
       LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
@@ -55,7 +55,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
   }
 
   if (device.IsCUDA()) {
-    common::SetDevice(device.ordinal);
+    curt::SetDevice(device.ordinal);
   }
   return device;
 }
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index dc3f10c4e..f0c155701 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -139,7 +139,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
       n_rows{n_rows},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   this->InitCompressedData(ctx);
 }
@@ -154,7 +154,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
       row_stride{row_stride},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   this->InitCompressedData(ctx);
   this->CreateHistIndices(ctx, page, feature_types);
@@ -173,7 +173,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc
           common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   this->InitCompressedData(ctx);
@@ -319,7 +319,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m
                                  bst_idx_t n_rows,
                                  std::shared_ptr<common::HistogramCuts const> cuts)
     : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
 
   if (this->IsDense()) {
     CopyDataToEllpack(ctx, batch, feature_types, this, missing);
diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu
index 6949f263d..839966b08 100644
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -85,7 +85,7 @@ template
   bytes += fo->Write(impl->is_dense);
   bytes += fo->Write(impl->row_stride);
   std::vector<common::CompressedByteT> h_gidx_buffer;
-  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  Context ctx = Context{}.MakeCUDA(curt::CurrentDevice());
   [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
   bytes += common::WriteVec(fo, h_gidx_buffer);
   bytes += fo->Write(impl->base_rowid);
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 9b1de14cb..588ddccec 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -202,7 +202,7 @@ EllpackMmapStreamPolicy::CreateReader(StringVi
 */
 template <typename F>
 void EllpackPageSourceImpl<F>::Fetch() {
-  common::SetDevice(this->Device().ordinal);
+  curt::SetDevice(this->Device().ordinal);
   if (!this->ReadCache()) {
     if (this->count_ != 0 && !this->sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
@@ -236,7 +236,7 @@ EllpackPageSourceImpl
 */
 template <typename F>
 void ExtEllpackPageSourceImpl<F>::Fetch() {
-  common::SetDevice(this->Device().ordinal);
+  curt::SetDevice(this->Device().ordinal);
   if (!this->ReadCache()) {
     auto iter = this->source_->Iter();
     CHECK_EQ(this->count_, iter);
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 8d28b71d4..40f29b6b9 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -61,7 +61,7 @@ template
 class EllpackFormatPolicy {
   std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
   DeviceOrd device_;
-  bool has_hmm_{common::SupportsPageableMem()};
+  bool has_hmm_{curt::SupportsPageableMem()};
 
  public:
   using FormatT = EllpackPageRawFormat;
@@ -71,7 +71,7 @@ class EllpackFormatPolicy {
     StringView msg{" The overhead of iterating through external memory might be significant."};
     if (!has_hmm_) {
       LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
-    } else if (!common::SupportsAts()) {
+    } else if (!curt::SupportsAts()) {
       LOG(WARNING) << "CUDA address translation service is not available." << msg;
     }
 #if !defined(XGBOOST_USE_RMM)
@@ -121,7 +121,7 @@ class EllpackCacheStreamPolicy : public F {
 
 template <typename S, template <typename> typename F>
 class EllpackMmapStreamPolicy : public F {
-  bool has_hmm_{common::SupportsPageableMem()};
+  bool has_hmm_{curt::SupportsPageableMem()};
 
  public:
   using WriterT = common::AlignedFileWriteStream;
diff --git a/src/data/quantile_dmatrix.cu b/src/data/quantile_dmatrix.cu
index 605040ef0..b41ab046d 100644
--- a/src/data/quantile_dmatrix.cu
+++ b/src/data/quantile_dmatrix.cu
@@ -64,8 +64,8 @@ void MakeSketches(Context const* ctx,
    * Get the data shape.
    */
   // We use do while here as the first batch is fetched in ctor
-  CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
-  common::SetDevice(dh::GetDevice(ctx).ordinal);
+  CHECK_LT(ctx->Ordinal(), curt::AllVisibleGPUs());
+  curt::SetDevice(dh::GetDevice(ctx).ordinal);
   if (ext_info.n_features == 0) {
     ext_info.n_features = data::BatchColumns(proxy);
     auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1),
@@ -124,7 +124,7 @@ void MakeSketches(Context const* ctx,
                  ext_info.base_rows.begin());
 
   // Get reference
-  common::SetDevice(dh::GetDevice(ctx).ordinal);
+  curt::SetDevice(dh::GetDevice(ctx).ordinal);
   if (!ref) {
     HostDeviceVector<FeatureType> ft;
     common::SketchContainer final_sketch(
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 2d288fa9d..d9d48f00b 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -37,7 +37,7 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
   size_t max_row_perbatch;
 
   void CheckGPUSupport() {
-    auto n_gpus = common::AllVisibleGPUs();
+    auto n_gpus = curt::AllVisibleGPUs();
     if (n_gpus == 0 && this->updater == "gpu_coord_descent") {
       common::AssertGPUSupport();
       this->UpdateAllowUnknown(Args{{"updater", "coord_descent"}});
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 80f319f46..5d016dfc7 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -105,7 +105,7 @@ void GBTree::Configure(Args const& cfg) {
   }
   cpu_predictor_->Configure(cfg);
 #if defined(XGBOOST_USE_CUDA)
-  auto n_gpus = common::AllVisibleGPUs();
+  auto n_gpus = curt::AllVisibleGPUs();
   if (!gpu_predictor_) {
     gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
   }
@@ -344,7 +344,7 @@ void GBTree::LoadConfig(Json const& in) {
   // This would cause all trees to be pushed to trees_to_update
   // e.g. updating a model, then saving and loading it would result in an empty model
   tparam_.process_type = TreeProcessType::kDefault;
-  std::int32_t const n_gpus = common::AllVisibleGPUs();
+  std::int32_t const n_gpus = curt::AllVisibleGPUs();
 
   auto msg = StringView{
       R"(
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 115d30e7a..843e45568 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -482,7 +482,7 @@ void ExtractPaths(Context const* ctx,
                   dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
                   DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
                   DeviceOrd device) {
-  common::SetDevice(device.ordinal);
+  curt::SetDevice(device.ordinal);
 
   auto& device_model = *model;
 
   dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -937,7 +937,7 @@ class GPUPredictor : public xgboost::Predictor {
       : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
 
   ~GPUPredictor() override {
-    if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
+    if (ctx_->IsCUDA() && ctx_->Ordinal() < curt::AllVisibleGPUs()) {
       dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     }
   }
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index a30f624fd..31b8d3496 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -184,7 +184,7 @@ struct GPUHistMakerDevice {
   // Reset values for each update iteration
   [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
     this->monitor.Start(__func__);
-    common::SetDevice(ctx_->Ordinal());
+    curt::SetDevice(ctx_->Ordinal());
 
     auto const& info = p_fmat->Info();
@@ -789,7 +789,7 @@ class GPUHistMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Hist]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    common::CheckComputeCapability();
+    curt::CheckComputeCapability();
     initialised_ = false;
 
     monitor_.Init("updater_gpu_hist");
@@ -835,7 +835,7 @@ class GPUHistMaker : public TreeUpdater {
         ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0));
     this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
 
-    common::SetDevice(ctx_->Ordinal());
+    curt::SetDevice(ctx_->Ordinal());
     p_fmat->Info().feature_types.SetDevice(ctx_->Device());
 
     std::vector<bst_idx_t> batch_ptr;
@@ -909,7 +909,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Approx]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    common::CheckComputeCapability();
+    curt::CheckComputeCapability();
     initialised_ = false;
 
     monitor_.Init(this->Name());
diff --git a/tests/cpp/collective/test_allgather.cu b/tests/cpp/collective/test_allgather.cu
index f145681da..d0c34cdc3 100644
--- a/tests/cpp/collective/test_allgather.cu
+++ b/tests/cpp/collective/test_allgather.cu
@@ -94,7 +94,7 @@ class MGPUAllgatherTest : public SocketTest {};
 }  // namespace
 
 TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
     Worker w{host, port, timeout, n_workers, r};
@@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
 }
 
 TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
     Worker w{host, port, timeout, n_workers, r};
diff --git a/tests/cpp/collective/test_allreduce.cu b/tests/cpp/collective/test_allreduce.cu
index 8bda1e0de..84d6a54db 100644
--- a/tests/cpp/collective/test_allreduce.cu
+++ b/tests/cpp/collective/test_allreduce.cu
@@ -5,7 +5,7 @@
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>  // for host_vector
 
-#include "../../../src/common/common.h"            // for AllVisibleGPUs
+#include "../../../src/common/cuda_rt_utils.h"     // for AllVisibleGPUs
 #include "../../../src/common/device_helpers.cuh"  // for ToSpan, device_vector
 #include "../../../src/common/type.h"              // for EraseType
 #include "test_worker.cuh"                         // for NCCLWorkerForTest
@@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
 }  // namespace
 
 TEST_F(MGPUAllreduceTest, BitOr) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
     Worker w{host, port, timeout, n_workers, r};
@@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
 }
 
 TEST_F(MGPUAllreduceTest, Sum) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
   TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
     Worker w{host, port, timeout, n_workers, r};
diff --git a/tests/cpp/collective/test_comm_group.cc b/tests/cpp/collective/test_comm_group.cc
index 3b1b5c5df..69fba60e7 100644
--- a/tests/cpp/collective/test_comm_group.cc
+++ b/tests/cpp/collective/test_comm_group.cc
@@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {
 
 #if defined(XGBOOST_USE_NCCL)
 TEST_F(CommGroupTest, BasicGPU) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                  std::int32_t r) {
     auto ctx = MakeCUDACtx(r);
diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h
index 4f6dfc1ff..da79f8882 100644
--- a/tests/cpp/collective/test_worker.h
+++ b/tests/cpp/collective/test_worker.h
@@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
   template <typename Fn>
   auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
               [[maybe_unused]] bool emulate_if_single = false) const {
-    auto n_gpus = common::AllVisibleGPUs();
+    auto n_gpus = curt::AllVisibleGPUs();
     if (is_federated) {
 #if defined(XGBOOST_USE_FEDERATED)
       if (n_gpus == 1 && emulate_if_single) {
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index c6a8c0ab9..9dff9c691 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -3,6 +3,11 @@
  */
 #include <gtest/gtest.h>
 
+#include <numeric>            // for iota
+#include <thrust/sequence.h>  // for sequence
+
+#include "../../../src/common/cuda_rt_utils.h"     // for DrVersion
+#include "../../../src/common/device_helpers.cuh"  // for CachingThrustPolicy, PinnedMemory
 #include "../../../src/common/device_vector.cuh"
 #include "xgboost/global_config.h"  // for GlobalConfigThreadLocalStore
 
@@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
   ASSERT_EQ(peak, n_bytes);
   std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
 }
+
+#if defined(__linux__)
+namespace {
+class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
+ public:
+  void Run() {
+    auto type = this->GetParam();
+    detail::GrowOnlyVirtualMemVec vec{type};
+    auto prop = xgboost::cudr::MakeAllocProp(type);
+    auto gran = xgboost::cudr::GetAllocGranularity(&prop);
+    ASSERT_GE(gran, 2);
+    auto data = vec.GetSpan<std::int32_t>(32);  // should be smaller than granularity
+    ASSERT_EQ(data.size(), 32);
+    static_assert(std::is_same_v<decltype(data), xgboost::common::Span<std::int32_t>>);
+
+    std::vector<std::int32_t> h_data(data.size());
+    auto check = [&] {
+      for (std::size_t i = 0; i < h_data.size(); ++i) {
+        ASSERT_EQ(h_data[i], i);
+      }
+    };
+    auto fill = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> data) {
+      if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
+        thrust::sequence(dh::CachingThrustPolicy(), data.data() + n_orig, data.data() + data.size(),
+                         n_orig);
+        dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault));
+      } else {
+        std::iota(data.data() + n_orig, data.data() + data.size(), n_orig);
+        std::copy_n(data.data(), data.size(), h_data.data());
+      }
+    };
+
+    fill(0, data);
+    check();
+
+    auto n_orig = data.size();
+    // Should be smaller than the granularity, use the already-reserved range.
+    data = vec.GetSpan<std::int32_t>(128);
+    h_data.resize(data.size());
+    fill(n_orig, data);
+    check();
+    if (128 < gran) {
+      ASSERT_EQ(vec.Capacity(), gran);
+    }
+
+    n_orig = data.size();
+    data = vec.GetSpan<std::int32_t>(gran / 2);
+    h_data.resize(data.size());
+    fill(n_orig, data);
+    check();
+    ASSERT_EQ(vec.Capacity(), gran * 2);
+
+    n_orig = data.size();
+    data = vec.GetSpan<std::int32_t>(gran);
+    h_data.resize(data.size());
+    fill(n_orig, data);
+    check();
+    ASSERT_EQ(vec.Capacity(), gran * 4);
+  }
+};
+}  // anonymous namespace
+
+TEST_P(TestVirtualMem, Alloc) { this->Run(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    Basic, TestVirtualMem,
+    ::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
+    [](::testing::TestParamInfo<CUmemLocationType> const& info) -> char const* {
+      auto type = info.param;
+      switch (type) {
+        case CU_MEM_LOCATION_TYPE_DEVICE:
+          return "Device";
+        case CU_MEM_LOCATION_TYPE_HOST_NUMA:
+          return "HostNuma";
+        default:
+          LOG(FATAL) << "unreachable";
+      }
+      return nullptr;
+    });
+#endif  // defined(__linux__)
+
+TEST(TestVirtualMem, Version) {
+  std::int32_t major, minor;
+  xgboost::curt::DrVersion(&major, &minor);
+  LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
+  PinnedMemory pinned;
+  if (major > 12 || (major == 12 && minor >= 5)) {
+    ASSERT_TRUE(pinned.IsVm());
+  } else {
+    ASSERT_FALSE(pinned.IsVm());
+  }
+}
 }  // namespace dh
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index 508a0e0b1..6957fbb8e 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
 
 namespace {
 auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   auto n = n_samples * n_features;
   std::vector<float> x;
   x.resize(n);
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index c730390c3..65b8135bf 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
 }
 
 void TestHostDeviceVector(size_t n, DeviceOrd device) {
-  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
+  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
   HostDeviceVector<int> v;
   InitHostDeviceVector(n, device, &v);
   CheckDevice(&v, n, 0, GPUAccess::kRead);
@@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
 TEST(HostDeviceVector, Copy) {
   size_t n = 1001;
   auto device = DeviceOrd::CUDA(0);
-  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
+  HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
 
   HostDeviceVector<int> v;
   {
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index 4ac4f9c70..a26aaedb5 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -72,7 +72,7 @@ TEST_P(TestEllpackPageRawFormat, DiskIO) {
 }
 
 TEST_P(TestEllpackPageRawFormat, DiskIOHmm) {
-  if (common::SupportsPageableMem()) {
+  if (curt::SupportsPageableMem()) {
     EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
     this->Run(&policy, this->GetParam());
   } else {
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 3dbf18970..78a6b3b03 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -655,7 +655,7 @@ class RMMAllocator {
   std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
   std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
   int n_gpu;
-  RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
+  RMMAllocator() : n_gpu(curt::AllVisibleGPUs()) {
     int current_device;
     CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
     for (int i = 0; i < n_gpu; ++i) {
@@ -697,5 +697,5 @@ void DeleteRMMResource(RMMAllocator*) {}
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
 
-std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
+std::int32_t DistGpuIdx() { return curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 }  // namespace xgboost
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 8e4e82a91..7137d0d51 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -34,7 +34,7 @@
 #endif
 
 #if defined(__CUDACC__)
-#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
+#define GPUIDX (curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
 #else
 #define GPUIDX (-1)
 #endif
diff --git a/tests/cpp/metric/test_distributed_metric.cc b/tests/cpp/metric/test_distributed_metric.cc
index 843ea5762..e1f50930b 100644
--- a/tests/cpp/metric/test_distributed_metric.cc
+++ b/tests/cpp/metric/test_distributed_metric.cc
@@ -47,7 +47,7 @@ class TestDistributedMetric : public ::testing::TestWithParam {
   std::int32_t n_workers{0};
   if (device.IsCUDA()) {
-    n_workers = common::AllVisibleGPUs();
+    n_workers = curt::AllVisibleGPUs();
   } else {
     n_workers = std::min(static_cast<std::int32_t>(std::thread::hardware_concurrency()), 3);
   }
diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu
index 31760a97f..67bf0ebc6 100644
--- a/tests/cpp/plugin/federated/test_federated_coll.cu
+++ b/tests/cpp/plugin/federated/test_federated_coll.cu
@@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr comm, std::int32_t rank) {
 }
 }  // namespace
 
 TEST_F(FederatedCollTestGPU, Allreduce) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
     TestAllreduce(comm, rank, n_workers);
   });
 }
 
 TEST(FederatedCollGPUGlobal, Allreduce) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederatedGlobal(n_workers, [&] {
     auto r = collective::GetRank();
     auto world = collective::GetWorldSize();
@@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
 }
 
 TEST_F(FederatedCollTestGPU, Broadcast) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
     TestBroadcast(comm, rank);
   });
 }
 
 TEST_F(FederatedCollTestGPU, Allgather) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
     TestAllgather(comm, rank, n_workers);
   });
@@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {
 
 TEST_F(FederatedCollTestGPU, AllgatherV) {
   std::int32_t n_workers = 2;
-  if (common::AllVisibleGPUs() < n_workers) {
+  if (curt::AllVisibleGPUs() < n_workers) {
     GTEST_SKIP_("At least 2 GPUs are required for the test.");
   }
   TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cc b/tests/cpp/plugin/federated/test_federated_comm_group.cc
index 511b3d8d1..0b7cad440 100644
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cc
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc
@@ -10,7 +10,7 @@
 namespace xgboost::collective {
 TEST(CommGroup, Federated) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
     Context ctx;
     ASSERT_EQ(comm_group->Rank(), r);
diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.cu b/tests/cpp/plugin/federated/test_federated_comm_group.cu
index c6fd8921c..3f289df37 100644
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cu
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu
@@ -11,7 +11,7 @@
 namespace xgboost::collective {
 TEST(CommGroup, FederatedGPU) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
   TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
     Context ctx = MakeCUDACtx(0);
     auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 366d0ab6a..11c9d4946 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
 }
 
 TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
-  TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
+  TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
 }
 
 TEST(GPUPredictor, CategoricalPrediction) {
@@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
 }
 
 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
   TestCategoricalPredictLeaf(&ctx, false);
 }
 
@@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
 }
 
 TEST_F(MGPUPredictorTest, SparseColumnSplit) {
-  TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
-  TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
+  TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
+  TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
 }
 }  // namespace xgboost::predictor
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 1af873f58..17a1fd3c2 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
   auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
   }
   auto learner = LearnerForTest(&ctx, m_train, kIters);
   auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
@@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
 
 void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
   }
   size_t constexpr kCols = 10;
   PredictionCacheEntry out_predictions;
@@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
   auto const rank = collective::GetRank();
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
   }
   auto n_threads = collective::GetWorkerLocalThreads(world_size);
   ctx.UpdateAllowUnknown(
@@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
                                        std::vector<float> const &expected_predt) {
   Context ctx;
   if (use_gpu) {
-    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+    ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
   }
   auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
   std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu
index 077698035..a2322d23b 100644
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -30,7 +30,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
 
 TEST(Context, DeviceOrdinal) {
   Context ctx;
-  auto n_vis = common::AllVisibleGPUs();
+  auto n_vis = curt::AllVisibleGPUs();
   auto ord = n_vis - 1;
 
   std::string device = "cuda:" + std::to_string(ord);
@@ -82,7 +82,7 @@ TEST(Context, GPUId) {
   ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
   TestCUDA(ctx, 0);
 
-  auto n_vis = common::AllVisibleGPUs();
+  auto n_vis = curt::AllVisibleGPUs();
   auto ord = n_vis - 1;
   ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
   TestCUDA(ctx, ord);
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 58e52e63d..a8551aa23 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -759,7 +759,7 @@ void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args
   auto world_size{3};
   if (use_gpu) {
-    world_size = common::AllVisibleGPUs();
+    world_size = curt::AllVisibleGPUs();
     // Simulate MPU on a single GPU. Federated doesn't use nccl, can run multiple
     // instances on the same GPU.
     if (world_size == 1 && federated) {
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 968a6a411..7c2da9d24 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -595,7 +595,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
 }  // anonymous namespace
 
 TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
-  if (common::AllVisibleGPUs() > 1) {
+  if (curt::AllVisibleGPUs() > 1) {
     // We can't emulate multiple GPUs with NCCL.
     this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
   }
@@ -603,7 +603,7 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
 }
 
 TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
-  if (common::AllVisibleGPUs() > 1) {
+  if (curt::AllVisibleGPUs() > 1) {
     // We can't emulate multiple GPUs with NCCL.
     this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
   }