Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
Jiaming Yuan 2024-09-28 04:26:44 +08:00 committed by GitHub
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions
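
For context, a minimal editorial sketch (not part of the commit) of how the new allocator is consumed through the dh::PinnedMemory interface modified in the diffs below:

// Editorial sketch only; dh::PinnedMemory and GetSpan are as defined later in this commit.
void DemoPinned() {
  dh::PinnedMemory pinned;                     // picks the virtual-memory backend when the driver passes the CTK 12.5 check
  auto a = pinned.GetSpan<float>(256);         // pinned host memory for 256 floats
  auto b = pinned.GetSpan<float>(4096, 0.0f);  // grow-only: the backing storage is extended and the span is filled with 0.0f
}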

src/common/cuda_dr_utils.cc (new file, 108 lines added)

@ -0,0 +1,108 @@
/**
* Copyright 2024, XGBoost contributors
*/
#if defined(XGBOOST_USE_CUDA)
#include "cuda_dr_utils.h"
#include <algorithm> // for max
#include <cstdint> // for int32_t
#include <cstring> // for memset
#include <memory> // for make_unique
#include <mutex> // for call_once
#include <sstream> // for stringstream
#include <string> // for string
#include "common.h" // for safe_cuda
#include "cuda_rt_utils.h" // for CurrentDevice
#include "xgboost/string_view.h" // for StringVie
namespace xgboost::cudr {
CuDriverApi::CuDriverApi() {
// similar to dlopen, but without the need to release a handle.
auto safe_load = [](xgboost::StringView name, auto **fnptr) {
cudaDriverEntryPointQueryResult status;
dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast<void **>(fnptr),
cudaEnablePerThreadDefaultStream, &status));
CHECK(status == cudaDriverEntryPointSuccess) << name;
CHECK(*fnptr);
};
safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity);
safe_load("cuMemCreate", &this->cuMemCreate);
safe_load("cuMemMap", &this->cuMemMap);
safe_load("cuMemAddressReserve", &this->cuMemAddressReserve);
safe_load("cuMemSetAccess", &this->cuMemSetAccess);
safe_load("cuMemUnmap", &this->cuMemUnmap);
safe_load("cuMemRelease", &this->cuMemRelease);
safe_load("cuMemAddressFree", &this->cuMemAddressFree);
safe_load("cuGetErrorString", &this->cuGetErrorString);
safe_load("cuGetErrorName", &this->cuGetErrorName);
safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
safe_load("cuDeviceGet", &this->cuDeviceGet);
CHECK(this->cuMemGetAllocationGranularity);
}
void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line,
char const *file) const {
if (status == CUDA_SUCCESS) {
return;
}
std::string cuerr{"CUDA driver error:"};
char const *name{nullptr};
auto err0 = this->cuGetErrorName(status, &name);
if (err0 != CUDA_SUCCESS) {
LOG(WARNING) << cuerr << status << ". Then we failed to get error name:" << err0;
}
char const *msg{nullptr};
auto err1 = this->cuGetErrorString(status, &msg);
if (err1 != CUDA_SUCCESS) {
LOG(WARNING) << cuerr << status << ". Then we failed to get error string:" << err1;
}
std::stringstream ss;
ss << fn << "[" << file << ":" << line << "]:";
if (name != nullptr && err0 == CUDA_SUCCESS) {
ss << cuerr << " " << name << ".";
}
if (msg != nullptr && err1 == CUDA_SUCCESS) {
ss << " " << msg << "\n";
}
LOG(FATAL) << ss.str();
}
[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() {
static std::once_flag flag;
static std::unique_ptr<CuDriverApi> cu;
std::call_once(flag, [&] { cu = std::make_unique<CuDriverApi>(); });
return *cu;
}
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
auto ordinal = curt::CurrentDevice();
loc->type = type;
if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
loc->id = ordinal;
} else {
std::int32_t numa_id = -1;
CUdevice device;
safe_cu(GetGlobalCuDriverApi().cuDeviceGet(&device, ordinal));
safe_cu(GetGlobalCuDriverApi().cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID,
device));
numa_id = std::max(numa_id, 0);
loc->id = numa_id;
}
}
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
CUmemAllocationProp prop;
std::memset(&prop, '\0', sizeof(prop));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
MakeCuMemLocation(type, &prop.location);
return prop;
}
} // namespace xgboost::cudr
#endif

src/common/cuda_dr_utils.h (new file, 105 lines added)

@ -0,0 +1,105 @@
/**
* Copyright 2024, XGBoost contributors
*
* @brief Utility for CUDA driver API.
*
* XGBoost doesn't link libcuda.so at build time. The utilities here load the shared
* object at runtime.
*/
#pragma once
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cstdint> // for int32_t
#include "xgboost/string_view.h" // for StringView
namespace xgboost::cudr {
/**
* @brief A struct for retrieving CUDA driver API from the runtime API.
*/
struct CuDriverApi {
using Flags = unsigned long long; // NOLINT
// Memory manipulation functions.
using MemGetAllocationGranularityFn = CUresult(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size,
const CUmemAllocationProp *prop, Flags flags);
using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset,
CUmemGenericAllocationHandle handle, Flags flags);
using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment,
CUdeviceptr addr, Flags flags);
using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc,
size_t count);
using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size);
using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle);
using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size);
// Error handling
using GetErrorString = CUresult(CUresult error, const char **pStr);
using GetErrorName = CUresult(CUresult error, const char **pStr);
// Device attributes
using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
using DeviceGet = CUresult(CUdevice *device, int ordinal);
MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr}; // NOLINT
MemCreateFn *cuMemCreate{nullptr}; // NOLINT
/**
* @param[in] offset - Must be zero.
*/
MemMapFn *cuMemMap{nullptr}; // NOLINT
/**
* @param[out] ptr - Resulting pointer to start of virtual address range allocated
* @param[in] size - Size of the reserved virtual address range requested
* @param[in] alignment - Alignment of the reserved virtual address range requested
* @param[in] addr - Fixed starting address range requested
* @param[in] flags - Currently unused, must be zero
*/
MemAddressReserveFn *cuMemAddressReserve{nullptr}; // NOLINT
MemSetAccessFn *cuMemSetAccess{nullptr}; // NOLINT
MemUnmapFn *cuMemUnmap{nullptr}; // NOLINT
MemReleaseFn *cuMemRelease{nullptr}; // NOLINT
MemAddressFreeFn *cuMemAddressFree{nullptr}; // NOLINT
GetErrorString *cuGetErrorString{nullptr}; // NOLINT
GetErrorName *cuGetErrorName{nullptr}; // NOLINT
DeviceGetAttribute *cuDeviceGetAttribute{nullptr}; // NOLINT
DeviceGet *cuDeviceGet{nullptr}; // NOLINT
CuDriverApi();
void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const;
};
[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi();
/**
* @brief Macro for guarding CUDA driver API calls.
*/
#define safe_cu(call) \
do { \
auto __status = (call); \
if (__status != CUDA_SUCCESS) { \
::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(__status, #call, __LINE__, __FILE__); \
} \
} while (0)
// Get the allocation granularity.
inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
std::size_t granularity;
safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
&granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
return granularity;
}
/**
* @brief Obtain appropriate device ordinal for `CUmemLocation`.
*/
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);
/**
* @brief Construct a `CUmemAllocationProp`.
*/
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
} // namespace xgboost::cudr
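
To illustrate how the pieces in this header fit together, here is a hedged, editorial sketch (not part of the commit) of a single allocation through the wrappers. It assumes a CTK recent enough to expose CU_MEM_LOCATION_TYPE_HOST_NUMA and uses only the helpers declared above:

#include <cstddef>          // for size_t
#include "cuda_dr_utils.h"  // wrappers declared above

void DemoVmmAlloc(std::size_t n_bytes) {
  auto &cu = xgboost::cudr::GetGlobalCuDriverApi();
  // Describe a host-NUMA (pinned) allocation and round the size up to the granularity.
  auto prop = xgboost::cudr::MakeAllocProp(CU_MEM_LOCATION_TYPE_HOST_NUMA);
  auto gran = xgboost::cudr::GetAllocGranularity(&prop);
  auto size = ((n_bytes + gran - 1) / gran) * gran;
  // Reserve a virtual address range and back it with physical memory.
  CUdeviceptr ptr = 0;
  safe_cu(cu.cuMemAddressReserve(&ptr, size, 0, 0, 0));
  CUmemGenericAllocationHandle handle;
  safe_cu(cu.cuMemCreate(&handle, size, &prop, 0));
  safe_cu(cu.cuMemMap(ptr, size, 0, handle, 0));
  // Grant the current device read/write access to the new mapping.
  // (The allocator in this commit additionally grants host access for host-NUMA allocations.)
  CUmemAccessDesc access;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &access.location);
  safe_cu(cu.cuMemSetAccess(ptr, size, &access, 1));
  // ... use the memory ...
  // Tear down in reverse order: unmap, release the physical handle, free the VA range.
  safe_cu(cu.cuMemUnmap(ptr, size));
  safe_cu(cu.cuMemRelease(handle));
  safe_cu(cu.cuMemAddressFree(ptr, size));
}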


@ -8,10 +8,11 @@
#endif // defined(XGBOOST_USE_CUDA)
#include <cstdint> // for int32_t
#include <mutex> // for once_flag, call_once
#include "common.h" // for safe_cuda
namespace xgboost::common {
namespace xgboost::curt {
#if defined(XGBOOST_USE_CUDA)
std::int32_t AllVisibleGPUs() {
int n_visgpus = 0;
@ -19,7 +20,7 @@ std::int32_t AllVisibleGPUs() {
// When compiled with CUDA but running on CPU only device,
// cudaGetDeviceCount will fail.
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
} catch (const dmlc::Error &) {
} catch (const dmlc::Error&) {
cudaGetLastError(); // reset error.
return 0;
}
@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) {
dh::safe_cuda(cudaSetDevice(device));
}
}
namespace {
template <typename Fn>
void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
static std::int32_t version = 0;
static std::once_flag flag;
std::call_once(flag, [&] { fn(&version); });
if (major) {
*major = version / 1000;
}
if (minor) {
*minor = version % 100 / 10;
}
}
} // namespace
void RtVersion(std::int32_t* major, std::int32_t* minor) {
GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaRuntimeGetVersion(ver)); }, major,
minor);
}
void DrVersion(std::int32_t* major, std::int32_t* minor) {
GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaDriverGetVersion(ver)); }, major, minor);
}
#else
std::int32_t AllVisibleGPUs() { return 0; }
std::int32_t CurrentDevice() {
AssertGPUSupport();
common::AssertGPUSupport();
return -1;
}
@ -79,8 +105,8 @@ void CheckComputeCapability() {}
void SetDevice(std::int32_t device) {
if (device >= 0) {
AssertGPUSupport();
common::AssertGPUSupport();
}
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace xgboost::common
} // namespace xgboost::curt


@ -8,7 +8,7 @@
#include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX)
namespace xgboost::common {
namespace xgboost::curt {
std::int32_t AllVisibleGPUs();
std::int32_t CurrentDevice();
@ -24,6 +24,12 @@ void CheckComputeCapability();
void SetDevice(std::int32_t device);
// Returns the CUDA Runtime version.
void RtVersion(std::int32_t* major, std::int32_t* minor);
// Returns the latest version of CUDA supported by the driver.
void DrVersion(std::int32_t* major, std::int32_t* minor);
struct NvtxDomain {
static constexpr char const *name{"libxgboost"}; // NOLINT
};
@ -49,10 +55,10 @@ class NvtxRgb {
explicit NvtxRgb(Args &&...) {}
};
#endif // defined(XGBOOST_USE_NVTX)
} // namespace xgboost::common
} // namespace xgboost::curt
#if defined(XGBOOST_USE_NVTX)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain)
#else
#define xgboost_NVTX_FN_RANGE()
#endif // defined(XGBOOST_USE_NVTX)


@ -0,0 +1,23 @@
/**
* Copyright 2024, XGBoost contributors
*/
#include "cuda_rt_utils.h" // for RtVersion
#include "device_helpers.cuh"
#include "xgboost/windefs.h" // for xgboost_IS_WIN
namespace dh {
PinnedMemory::PinnedMemory() {
#if defined(xgboost_IS_WIN)
this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
#else
std::int32_t major{0}, minor{0};
xgboost::curt::DrVersion(&major, &minor);
// Host NUMA allocation requires a driver that supports CTK >= 12.5 to be stable.
if (major >= 12 && minor >= 5) {
this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
} else {
this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
}
#endif
}
} // namespace dh


@ -16,7 +16,8 @@
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_type.cuh> // for UnitWord, DoubleBuffer
#include <vector>
#include <variant> // for variant, visit
#include <vector> // for vector
#include "common.h"
#include "device_vector.cuh"
@ -372,36 +373,25 @@ void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<const T>
}
// Keep track of pinned memory allocation
struct PinnedMemory {
void *temp_storage{nullptr};
size_t temp_storage_bytes{0};
class PinnedMemory {
std::variant<detail::GrowOnlyPinnedMemoryImpl, detail::GrowOnlyVirtualMemVec> impl_;
~PinnedMemory() { Free(); }
public:
PinnedMemory();
template <typename T>
xgboost::common::Span<T> GetSpan(size_t size) {
size_t num_bytes = size * sizeof(T);
if (num_bytes > temp_storage_bytes) {
Free();
safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
temp_storage_bytes = num_bytes;
}
return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
return std::visit([&](auto &&alloc) { return alloc.template GetSpan<T>(size); }, this->impl_);
}
template <typename T>
xgboost::common::Span<T> GetSpan(size_t size, T init) {
xgboost::common::Span<T> GetSpan(size_t size, T const &init) {
auto result = this->GetSpan<T>(size);
for (auto &e : result) {
e = init;
}
std::fill_n(result.data(), result.size(), init);
return result;
}
void Free() {
if (temp_storage != nullptr) {
safe_cuda(cudaFreeHost(temp_storage));
}
// Used for testing.
[[nodiscard]] bool IsVm() {
return std::get_if<detail::GrowOnlyVirtualMemVec>(&this->impl_) != nullptr;
}
};


@ -1,10 +1,14 @@
/**
* Copyright 2017-2024, XGBoost contributors
*/
#include <numeric> // for accumulate
#include "../collective/communicator-inl.h" // for GetRank
#include "common.h" // for HumanMemUnit
#include "device_helpers.cuh" // for CurrentDevice
#include "cuda_dr_utils.h"
#include "device_helpers.cuh" // for CurrentDevice
#include "device_vector.cuh"
#include "transform_iterator.h" // for MakeIndexTransformIter
namespace dh {
namespace detail {
@ -18,6 +22,79 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
<< "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
LOG(FATAL) << ss.str();
}
[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::PhyCapacity() const {
auto it = xgboost::common::MakeIndexTransformIter(
[&](std::size_t i) { return this->handles_[i]->size; });
return std::accumulate(it, it + this->handles_.size(), static_cast<std::size_t>(0));
}
void GrowOnlyVirtualMemVec::Reserve(std::size_t new_size) {
auto va_capacity = this->Capacity();
if (new_size < va_capacity) {
return;
}
// Try to reserve new virtual address.
auto const aligned_size = RoundUp(new_size, this->granularity_);
auto const new_reserve_size = aligned_size - va_capacity;
CUresult status = CUDA_SUCCESS;
auto hint = this->DevPtr() + va_capacity;
bool failed{false};
auto range = std::make_unique<VaRange>(new_reserve_size, hint, &status, &failed);
if (failed) {
// Failed to reserve the requested address.
// Slow path, try to reserve a new address with full size.
range = std::make_unique<VaRange>(aligned_size, 0ULL, &status, &failed);
safe_cu(status);
CHECK(!failed);
// New allocation is successful. Map the physical address to the virtual address.
// First unmap the existing ptr.
if (this->DevPtr() != 0) {
// Unmap the existing ptr.
safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
// Then remap all the existing physical addresses to the new ptr.
CUdeviceptr ptr = range->DevPtr();
for (auto const &hdl : this->handles_) {
this->MapBlock(ptr, hdl);
ptr += hdl->size;
}
// Release the existing ptr.
va_ranges_.clear();
}
}
va_ranges_.emplace_back(std::move(range));
}
GrowOnlyVirtualMemVec::GrowOnlyVirtualMemVec(CUmemLocationType type)
: prop_{xgboost::cudr::MakeAllocProp(type)},
granularity_{xgboost::cudr::GetAllocGranularity(&this->prop_)} {
CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA);
// Assign the access descriptor
CUmemAccessDesc dacc;
dacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &dacc.location);
this->access_desc_.push_back(dacc);
if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
CUmemAccessDesc hacc;
hacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
xgboost::cudr::MakeCuMemLocation(type, &hacc.location);
this->access_desc_.push_back(hacc);
}
}
[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::Capacity() const {
auto it = xgboost::common::MakeIndexTransformIter(
[&](std::size_t i) { return this->va_ranges_[i]->Size(); });
return std::accumulate(it, it + this->va_ranges_.size(), static_cast<std::size_t>(0));
}
} // namespace detail
#if defined(XGBOOST_USE_RMM)


@ -25,6 +25,8 @@
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include <cuda.h>
#include <cstddef> // for size_t
#include <cub/util_allocator.cuh> // for CachingDeviceAllocator
#include <cub/util_device.cuh> // for CurrentDevice
@ -32,8 +34,10 @@
#include <memory> // for unique_ptr
#include <mutex> // for defer_lock
#include "common.h" // for safe_cuda, HumanMemUnit
#include "common.h" // for safe_cuda, HumanMemUnit
#include "cuda_dr_utils.h" // for CuDriverApi
#include "xgboost/logging.h"
#include "xgboost/span.h" // for Span
namespace dh {
namespace detail {
@ -127,6 +131,153 @@ class MemoryLogger {
};
void ThrowOOMError(std::string const &err, std::size_t bytes);
struct GrowOnlyPinnedMemoryImpl {
void *temp_storage{nullptr};
size_t temp_storage_bytes{0};
~GrowOnlyPinnedMemoryImpl() { Free(); }
template <typename T>
xgboost::common::Span<T> GetSpan(size_t size) {
size_t num_bytes = size * sizeof(T);
if (num_bytes > temp_storage_bytes) {
Free();
safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
temp_storage_bytes = num_bytes;
}
return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
}
void Free() {
if (temp_storage != nullptr) {
safe_cuda(cudaFreeHost(temp_storage));
}
}
};
/**
* @brief Use low-level virtual memory functions from CUDA driver API for grow-only memory
* allocation.
*
* @url https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/
*
* Aside from the potential performance benefits, this is primarily implemented to prevent
* deadlock in NCCL and XGBoost. The host NUMA version requires CTK12.5+ to be stable.
*/
class GrowOnlyVirtualMemVec {
static auto RoundUp(std::size_t new_sz, std::size_t chunk_sz) {
return ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;
}
struct PhyAddrHandle {
CUmemGenericAllocationHandle handle;
std::size_t size;
};
class VaRange {
CUdeviceptr ptr_{0};
std::size_t size_{0};
public:
VaRange(std::size_t size, CUdeviceptr hint, CUresult *p_status, bool *failed) : size_{size} {
CUresult &status = *p_status;
status = xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressReserve(&ptr_, size, 0, hint, 0);
*failed = status != CUDA_SUCCESS || (hint != 0 && ptr_ != hint);
}
~VaRange() {
if (ptr_ != 0) {
xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressFree(ptr_, this->size_);
}
}
VaRange(VaRange const &that) = delete;
VaRange &operator=(VaRange const &that) = delete;
VaRange(VaRange &&that) { std::swap(*this, that); }
VaRange &operator=(VaRange &&that) {
std::swap(*this, that);
return *this;
}
[[nodiscard]] auto DevPtr() const { return this->ptr_; }
[[nodiscard]] std::size_t Size() const { return this->size_; }
};
using PhyHandle = std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>;
std::vector<PhyHandle> handles_;
std::vector<std::unique_ptr<VaRange>> va_ranges_;
xgboost::cudr::CuDriverApi &cu_{xgboost::cudr::GetGlobalCuDriverApi()};
std::vector<CUmemAccessDesc> access_desc_;
CUmemAllocationProp const prop_;
// Always use bytes.
std::size_t const granularity_;
[[nodiscard]] std::size_t PhyCapacity() const;
[[nodiscard]] CUdeviceptr DevPtr() const {
if (this->va_ranges_.empty()) {
return 0;
}
return this->va_ranges_.front()->DevPtr();
}
void MapBlock(CUdeviceptr ptr, PhyHandle const &hdl) const {
safe_cu(cu_.cuMemMap(ptr, hdl->size, 0, hdl->handle, 0));
safe_cu(cu_.cuMemSetAccess(ptr, hdl->size, access_desc_.data(), access_desc_.size()));
}
auto CreatePhysicalMem(std::size_t size) const {
CUmemGenericAllocationHandle alloc_handle;
auto padded_size = RoundUp(size, this->granularity_);
CUresult status = this->cu_.cuMemCreate(&alloc_handle, padded_size, &this->prop_, 0);
CHECK_EQ(status, CUDA_SUCCESS);
return alloc_handle;
}
void Reserve(std::size_t new_size);
public:
explicit GrowOnlyVirtualMemVec(CUmemLocationType type);
void GrowTo(std::size_t n_bytes) {
auto alloc_size = this->PhyCapacity();
if (n_bytes <= alloc_size) {
return;
}
std::size_t delta = n_bytes - alloc_size;
auto const padded_delta = RoundUp(delta, this->granularity_);
this->Reserve(alloc_size + padded_delta);
this->handles_.emplace_back(
std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>{
new PhyAddrHandle{this->CreatePhysicalMem(padded_delta), padded_delta}, [&](auto *hdl) {
if (hdl) {
cu_.cuMemRelease(hdl->handle);
}
}});
auto ptr = this->DevPtr() + alloc_size;
this->MapBlock(ptr, this->handles_.back());
}
template <typename T>
xgboost::common::Span<T> GetSpan(std::size_t size) {
size_t n_bytes = size * sizeof(T);
this->GrowTo(n_bytes);
return xgboost::common::Span<T>(reinterpret_cast<T *>(this->DevPtr()), size);
}
~GrowOnlyVirtualMemVec() noexcept(false) {
if (this->DevPtr() != 0) {
safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
}
this->va_ranges_.clear(); // make sure all VA are freed before releasing the handles.
this->handles_.clear(); // release the handles
}
[[nodiscard]] void *data() { return reinterpret_cast<void *>(this->DevPtr()); } // NOLINT
[[nodiscard]] std::size_t size() const { return this->PhyCapacity(); } // NOLINT
[[nodiscard]] std::size_t Capacity() const;
};
} // namespace detail
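
To make the grow-only semantics of the class above concrete, a short editorial sketch (not part of the commit); it assumes a valid CUDA context and uses only members shown in this diff:

void DemoGrowOnly() {
  dh::detail::GrowOnlyVirtualMemVec buf{CU_MEM_LOCATION_TYPE_DEVICE};
  auto a = buf.GetSpan<int>(1024);     // reserves a VA range and maps the first physical block
  auto b = buf.GetSpan<int>(1 << 20);  // grows: maps additional physical blocks into the range
  // Fast path: the VA reservation is extended in place, the base pointer stays the same, and
  // data written through `a` remains visible through `b`. Slow path: a larger range is
  // reserved, the existing physical blocks are remapped to it, and the pointer changes while
  // the contents are preserved.
}
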
inline detail::MemoryLogger &GlobalMemoryLogger() {


@ -337,7 +337,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
int num_cuts_per_feature, bool is_ranking, float missing,
size_t columns, size_t begin, size_t end,
SketchContainer* sketch_container) {
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
info.weights_.SetDevice(ctx->Device());
auto weights = info.weights_.ConstDeviceSpan();


@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Context const *ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights) {
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty();
@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,
* pruning or merging. We preserve the first type and remove the second type.
*/
timer_.Start(__func__);
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
auto key_it = dh::MakeTransformIterator<size_t>(
@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,
void SketchContainer::Prune(Context const* ctx, std::size_t to) {
timer_.Start(__func__);
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector();
@ -443,7 +443,7 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) {
void SketchContainer::Merge(Context const *ctx, Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) {
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
auto self = dh::ToSpan(this->Current());
LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". "
<< "That:" << HumanMemUnit(that.size_bytes()) << ". "
@ -507,7 +507,7 @@ void SketchContainer::FixError() {
}
void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) {
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
return;
@ -596,7 +596,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.


@ -206,7 +206,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__);
SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
this->columns_ptr_.SetDevice(ctx->Device());
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);


@ -9,10 +9,12 @@
#include <fstream> // for ifstream
#include <string> // for string
#include "common.h" // for DivRoundUp
#include "common.h" // for DivRoundUp
#if defined(__linux__)
#include <pthread.h>
#include <sys/syscall.h> // for SYS_getcpu
#include <unistd.h> // for syscall
#endif
namespace xgboost::common {
@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
return n_threads;
}
[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) {
#ifdef SYS_getcpu
return syscall(SYS_getcpu, cpu, numa, NULL) == 0;
#else
return false;
#endif
}
void NameThread(std::thread* t, StringView name) {
#if defined(__linux__)
auto handle = t->native_handle();


@ -306,10 +306,16 @@ class MemStackAllocator {
};
/**
* \brief Constant that can be used for initializing static thread local memory.
* @brief Constant that can be used for initializing static thread local memory.
*/
std::int32_t constexpr DefaultMaxThreads() { return 128; }
/**
* @brief Get numa node on Linux. Other platforms are not supported. Returns false if the
* call fails.
*/
[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa);
/**
* @brief Give the thread a name. Supports only pthread on linux.
*/
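
For illustration only (not part of the commit), a short hypothetical use of the new GetCpuNuma helper; it assumes the header above is included and degrades gracefully where SYS_getcpu is unavailable:

#include <iostream>

void ReportNuma() {
  unsigned int cpu = 0, numa = 0;
  if (xgboost::common::GetCpuNuma(&cpu, &numa)) {
    std::cout << "running on cpu " << cpu << ", numa node " << numa << "\n";
  } else {
    std::cout << "getcpu is not available on this platform\n";  // non-Linux or SYS_getcpu missing
  }
}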


@ -18,7 +18,7 @@ void Monitor::Start(std::string const &name) {
auto &stats = statistics_map_[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
auto range_handle = nvtx3::start_range_in<curt::NvtxDomain>(label_ + "::" + name);
stats.nvtx_id = range_handle.get_value();
#endif // defined(XGBOOST_USE_NVTX)
}
@ -30,7 +30,7 @@ void Monitor::Stop(const std::string &name) {
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
nvtx3::end_range_in<curt::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
#endif // defined(XGBOOST_USE_NVTX)
}
}


@ -38,7 +38,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
// When booster is loaded from a memory image (Python pickle or R raw model), number of
// available GPUs could be different. Wrap around it.
std::int32_t n_visible = common::AllVisibleGPUs();
std::int32_t n_visible = curt::AllVisibleGPUs();
if (n_visible == 0) {
if (device.IsCUDA()) {
LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
@ -55,7 +55,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
}
if (device.IsCUDA()) {
common::SetDevice(device.ordinal);
curt::SetDevice(device.ordinal);
}
return device;
}


@ -139,7 +139,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
n_rows{n_rows},
n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
monitor_.Init("ellpack_page");
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
this->InitCompressedData(ctx);
}
@ -154,7 +154,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
row_stride{row_stride},
n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
monitor_.Init("ellpack_page");
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
this->InitCompressedData(ctx);
this->CreateHistIndices(ctx, page, feature_types);
@ -173,7 +173,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc
common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))},
n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
monitor_.Init("ellpack_page");
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
this->InitCompressedData(ctx);
@ -319,7 +319,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m
bst_idx_t n_rows,
std::shared_ptr<common::HistogramCuts const> cuts)
: EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} {
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
if (this->IsDense()) {
CopyDataToEllpack<true>(ctx, batch, feature_types, this, missing);


@ -85,7 +85,7 @@ template <typename T>
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);
std::vector<common::CompressedByteT> h_gidx_buffer;
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
Context ctx = Context{}.MakeCUDA(curt::CurrentDevice());
[[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
bytes += common::WriteVec(fo, h_gidx_buffer);
bytes += fo->Write(impl->base_rowid);


@ -202,7 +202,7 @@ EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringVi
*/
template <typename F>
void EllpackPageSourceImpl<F>::Fetch() {
common::SetDevice(this->Device().ordinal);
curt::SetDevice(this->Device().ordinal);
if (!this->ReadCache()) {
if (this->count_ != 0 && !this->sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0
@ -236,7 +236,7 @@ EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>
*/
template <typename F>
void ExtEllpackPageSourceImpl<F>::Fetch() {
common::SetDevice(this->Device().ordinal);
curt::SetDevice(this->Device().ordinal);
if (!this->ReadCache()) {
auto iter = this->source_->Iter();
CHECK_EQ(this->count_, iter);


@ -61,7 +61,7 @@ template <typename S>
class EllpackFormatPolicy {
std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
DeviceOrd device_;
bool has_hmm_{common::SupportsPageableMem()};
bool has_hmm_{curt::SupportsPageableMem()};
public:
using FormatT = EllpackPageRawFormat;
@ -71,7 +71,7 @@ class EllpackFormatPolicy {
StringView msg{" The overhead of iterating through external memory might be significant."};
if (!has_hmm_) {
LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
} else if (!common::SupportsAts()) {
} else if (!curt::SupportsAts()) {
LOG(WARNING) << "CUDA address translation service is not available." << msg;
}
#if !defined(XGBOOST_USE_RMM)
@ -121,7 +121,7 @@ class EllpackCacheStreamPolicy : public F<S> {
template <typename S, template <typename> typename F>
class EllpackMmapStreamPolicy : public F<S> {
bool has_hmm_{common::SupportsPageableMem()};
bool has_hmm_{curt::SupportsPageableMem()};
public:
using WriterT = common::AlignedFileWriteStream;


@ -64,8 +64,8 @@ void MakeSketches(Context const* ctx,
* Get the data shape.
*/
// We use do while here as the first batch is fetched in ctor
CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
common::SetDevice(dh::GetDevice(ctx).ordinal);
CHECK_LT(ctx->Ordinal(), curt::AllVisibleGPUs());
curt::SetDevice(dh::GetDevice(ctx).ordinal);
if (ext_info.n_features == 0) {
ext_info.n_features = data::BatchColumns(proxy);
auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1),
@ -124,7 +124,7 @@ void MakeSketches(Context const* ctx,
ext_info.base_rows.begin());
// Get reference
common::SetDevice(dh::GetDevice(ctx).ordinal);
curt::SetDevice(dh::GetDevice(ctx).ordinal);
if (!ref) {
HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch(


@ -37,7 +37,7 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
size_t max_row_perbatch;
void CheckGPUSupport() {
auto n_gpus = common::AllVisibleGPUs();
auto n_gpus = curt::AllVisibleGPUs();
if (n_gpus == 0 && this->updater == "gpu_coord_descent") {
common::AssertGPUSupport();
this->UpdateAllowUnknown(Args{{"updater", "coord_descent"}});


@ -105,7 +105,7 @@ void GBTree::Configure(Args const& cfg) {
}
cpu_predictor_->Configure(cfg);
#if defined(XGBOOST_USE_CUDA)
auto n_gpus = common::AllVisibleGPUs();
auto n_gpus = curt::AllVisibleGPUs();
if (!gpu_predictor_) {
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
}
@ -344,7 +344,7 @@ void GBTree::LoadConfig(Json const& in) {
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
tparam_.process_type = TreeProcessType::kDefault;
std::int32_t const n_gpus = common::AllVisibleGPUs();
std::int32_t const n_gpus = curt::AllVisibleGPUs();
auto msg = StringView{
R"(


@ -482,7 +482,7 @@ void ExtractPaths(Context const* ctx,
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
DeviceOrd device) {
common::SetDevice(device.ordinal);
curt::SetDevice(device.ordinal);
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@ -937,7 +937,7 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
~GPUPredictor() override {
if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
if (ctx_->IsCUDA() && ctx_->Ordinal() < curt::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
}
}


@ -184,7 +184,7 @@ struct GPUHistMakerDevice {
// Reset values for each update iteration
[[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
this->monitor.Start(__func__);
common::SetDevice(ctx_->Ordinal());
curt::SetDevice(ctx_->Ordinal());
auto const& info = p_fmat->Info();
@ -789,7 +789,7 @@ class GPUHistMaker : public TreeUpdater {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Hist]: Configure";
hist_maker_param_.UpdateAllowUnknown(args);
common::CheckComputeCapability();
curt::CheckComputeCapability();
initialised_ = false;
monitor_.Init("updater_gpu_hist");
@ -835,7 +835,7 @@ class GPUHistMaker : public TreeUpdater {
ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0));
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
common::SetDevice(ctx_->Ordinal());
curt::SetDevice(ctx_->Ordinal());
p_fmat->Info().feature_types.SetDevice(ctx_->Device());
std::vector<bst_idx_t> batch_ptr;
@ -909,7 +909,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
// Used in test to count how many configurations are performed
LOG(DEBUG) << "[GPU Approx]: Configure";
hist_maker_param_.UpdateAllowUnknown(args);
common::CheckComputeCapability();
curt::CheckComputeCapability();
initialised_ = false;
monitor_.Init(this->Name());


@ -94,7 +94,7 @@ class MGPUAllgatherTest : public SocketTest {};
} // namespace
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};
@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
}
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};


@ -5,7 +5,7 @@
#include <gtest/gtest.h>
#include <thrust/host_vector.h> // for host_vector
#include "../../../src/common/common.h" // for AllVisibleGPUs
#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
#include "../../../src/common/type.h" // for EraseType
#include "test_worker.cuh" // for NCCLWorkerForTest
@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
} // namespace
TEST_F(MGPUAllreduceTest, BitOr) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};
@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
}
TEST_F(MGPUAllreduceTest, Sum) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};


@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {
#if defined(XGBOOST_USE_NCCL)
TEST_F(CommGroupTest, BasicGPU) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
auto ctx = MakeCUDACtx(r);


@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
template <typename Fn>
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
[[maybe_unused]] bool emulate_if_single = false) const {
auto n_gpus = common::AllVisibleGPUs();
auto n_gpus = curt::AllVisibleGPUs();
if (is_federated) {
#if defined(XGBOOST_USE_FEDERATED)
if (n_gpus == 1 && emulate_if_single) {


@ -3,6 +3,11 @@
*/
#include <gtest/gtest.h>
#include <numeric> // for iota
#include <thrust/detail/sequence.inl> // for sequence
#include "../../../src/common/cuda_rt_utils.h" // for DrVersion
#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory
#include "../../../src/common/device_vector.cuh"
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
ASSERT_EQ(peak, n_bytes);
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
}
#if defined(__linux__)
namespace {
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
public:
void Run() {
auto type = this->GetParam();
detail::GrowOnlyVirtualMemVec vec{type};
auto prop = xgboost::cudr::MakeAllocProp(type);
auto gran = xgboost::cudr::GetAllocGranularity(&prop);
ASSERT_GE(gran, 2);
auto data = vec.GetSpan<std::int32_t>(32); // should be smaller than granularity
ASSERT_EQ(data.size(), 32);
static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);
std::vector<std::int32_t> h_data(data.size());
auto check = [&] {
for (std::size_t i = 0; i < h_data.size(); ++i) {
ASSERT_EQ(h_data[i], i);
}
};
auto fill = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> data) {
if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
thrust::sequence(dh::CachingThrustPolicy(), data.data() + n_orig, data.data() + data.size(),
n_orig);
dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault));
} else {
std::iota(data.data() + n_orig, data.data() + data.size(), n_orig);
std::copy_n(data.data(), data.size(), h_data.data());
}
};
fill(0, data);
check();
auto n_orig = data.size();
// Should be smaller than granularity, use already reserved.
data = vec.GetSpan<std::int32_t>(128);
h_data.resize(data.size());
fill(n_orig, data);
check();
if (128 < gran) {
ASSERT_EQ(vec.Capacity(), gran);
}
n_orig = data.size();
data = vec.GetSpan<std::int32_t>(gran / 2);
h_data.resize(data.size());
fill(n_orig, data);
check();
ASSERT_EQ(vec.Capacity(), gran * 2);
n_orig = data.size();
data = vec.GetSpan<std::int32_t>(gran);
h_data.resize(data.size());
fill(n_orig, data);
check();
ASSERT_EQ(vec.Capacity(), gran * 4);
}
};
} // anonymous namespace
TEST_P(TestVirtualMem, Alloc) { this->Run(); }
INSTANTIATE_TEST_SUITE_P(
Basic, TestVirtualMem,
::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
[](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
auto type = info.param;
switch (type) {
case CU_MEM_LOCATION_TYPE_DEVICE:
return "Device";
case CU_MEM_LOCATION_TYPE_HOST_NUMA:
return "HostNuma";
default:
LOG(FATAL) << "unreachable";
}
return nullptr;
});
#endif // defined(__linux__)
TEST(TestVirtualMem, Version) {
std::int32_t major, minor;
xgboost::curt::DrVersion(&major, &minor);
LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
PinnedMemory pinned;
if (major >= 12 && minor >= 5) {
ASSERT_TRUE(pinned.IsVm());
} else {
ASSERT_FALSE(pinned.IsVm());
}
}
} // namespace dh


@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
auto n = n_samples * n_features;
std::vector<float> x;
x.resize(n);


@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
}
void TestHostDeviceVector(size_t n, DeviceOrd device) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, device, &v);
CheckDevice(&v, n, 0, GPUAccess::kRead);
@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
TEST(HostDeviceVector, Copy) {
size_t n = 1001;
auto device = DeviceOrd::CUDA(0);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
HostDeviceVector<int> v;
{


@ -72,7 +72,7 @@ TEST_P(TestEllpackPageRawFormat, DiskIO) {
}
TEST_P(TestEllpackPageRawFormat, DiskIOHmm) {
if (common::SupportsPageableMem()) {
if (curt::SupportsPageableMem()) {
EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
this->Run(&policy, this->GetParam());
} else {


@ -655,7 +655,7 @@ class RMMAllocator {
std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
int n_gpu;
RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
RMMAllocator() : n_gpu(curt::AllVisibleGPUs()) {
int current_device;
CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
for (int i = 0; i < n_gpu; ++i) {
@ -697,5 +697,5 @@ void DeleteRMMResource(RMMAllocator*) {}
RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
std::int32_t DistGpuIdx() { return curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
} // namespace xgboost


@ -34,7 +34,7 @@
#endif
#if defined(__CUDACC__)
#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
#define GPUIDX (curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
#else
#define GPUIDX (-1)
#endif


@ -47,7 +47,7 @@ class TestDistributedMetric : public ::testing::TestWithParam<Param> {
std::int32_t n_workers{0};
if (device.IsCUDA()) {
n_workers = common::AllVisibleGPUs();
n_workers = curt::AllVisibleGPUs();
} else {
n_workers = std::min(static_cast<std::int32_t>(std::thread::hardware_concurrency()), 3);
}


@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
} // namespace
TEST_F(FederatedCollTestGPU, Allreduce) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
TestAllreduce(comm, rank, n_workers);
});
}
TEST(FederatedCollGPUGlobal, Allreduce) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederatedGlobal(n_workers, [&] {
auto r = collective::GetRank();
auto world = collective::GetWorldSize();
@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
}
TEST_F(FederatedCollTestGPU, Broadcast) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
TestBroadcast(comm, rank);
});
}
TEST_F(FederatedCollTestGPU, Allgather) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
TestAllgather(comm, rank, n_workers);
});
@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {
TEST_F(FederatedCollTestGPU, AllgatherV) {
std::int32_t n_workers = 2;
if (common::AllVisibleGPUs() < n_workers) {
if (curt::AllVisibleGPUs() < n_workers) {
GTEST_SKIP_("At least 2 GPUs are required for the test.");
}
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {


@ -10,7 +10,7 @@
namespace xgboost::collective {
TEST(CommGroup, Federated) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
Context ctx;
ASSERT_EQ(comm_group->Rank(), r);


@ -11,7 +11,7 @@
namespace xgboost::collective {
TEST(CommGroup, FederatedGPU) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
Context ctx = MakeCUDACtx(0);
auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));


@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
}
TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
}
TEST(GPUPredictor, CategoricalPrediction) {
@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
}
TEST(GPUPredictor, CategoricalPredictLeaf) {
auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
TestCategoricalPredictLeaf(&ctx, false);
}
@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
}
TEST_F(MGPUPredictorTest, SparseColumnSplit) {
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
}
} // namespace xgboost::predictor


@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
}
auto learner = LearnerForTest(&ctx, m_train, kIters);
auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
}
size_t constexpr kCols = 10;
PredictionCacheEntry out_predictions;
@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
auto const rank = collective::GetRank();
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
}
auto n_threads = collective::GetWorkerLocalThreads(world_size);
ctx.UpdateAllowUnknown(
@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
std::vector<float> const &expected_predt) {
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
}
auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};


@ -30,7 +30,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
TEST(Context, DeviceOrdinal) {
Context ctx;
auto n_vis = common::AllVisibleGPUs();
auto n_vis = curt::AllVisibleGPUs();
auto ord = n_vis - 1;
std::string device = "cuda:" + std::to_string(ord);
@ -82,7 +82,7 @@ TEST(Context, GPUId) {
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
TestCUDA(ctx, 0);
auto n_vis = common::AllVisibleGPUs();
auto n_vis = curt::AllVisibleGPUs();
auto ord = n_vis - 1;
ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
TestCUDA(ctx, ord);


@ -759,7 +759,7 @@ void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args
auto world_size{3};
if (use_gpu) {
world_size = common::AllVisibleGPUs();
world_size = curt::AllVisibleGPUs();
// Simulate multi-GPU on a single GPU. Federated doesn't use NCCL, can run multiple
// instances on the same GPU.
if (world_size == 1 && federated) {


@ -595,7 +595,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
} // anonymous namespace
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
if (common::AllVisibleGPUs() > 1) {
if (curt::AllVisibleGPUs() > 1) {
// We can't emulate multiple GPUs with NCCL.
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
}
@ -603,7 +603,7 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
}
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
if (common::AllVisibleGPUs() > 1) {
if (curt::AllVisibleGPUs() > 1) {
// We can't emulate multiple GPUs with NCCL.
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
}