Use CUDA virtual memory for pinned memory allocation. (#10850)
- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
parent 13b9874fd6
commit 271f4a80e7
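
Before the file-by-file diff, a minimal usage sketch of the allocator this commit introduces. It assumes a CUDA-enabled build of this branch; the element type, sizes, and the function name below are illustrative only and are not part of the commit:

  // Hypothetical caller of the new grow-only pinned allocator (dh::PinnedMemory).
  #include "device_helpers.cuh"  // dh::PinnedMemory, from src/common in this branch

  void PinnedScratchSketch() {
    dh::PinnedMemory pinned;                      // picks the virtual-memory backend when available
    auto small = pinned.GetSpan<float>(256);      // first request maps one physical block
    auto large = pinned.GetSpan<float>(1 << 20);  // a later request only grows the mapping
    // With the virtual-memory backend the data written through `small` is preserved across the
    // growth (remapped rather than discarded); the cudaMallocHost fallback re-allocates instead.
    (void)small;
    (void)large;
  }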
src/common/cuda_dr_utils.cc (new file, 108 lines added)
@@ -0,0 +1,108 @@
/**
 * Copyright 2024, XGBoost contributors
 */
#if defined(XGBOOST_USE_CUDA)
#include "cuda_dr_utils.h"

#include <algorithm>  // for max
#include <cstdint>    // for int32_t
#include <cstring>    // for memset
#include <memory>     // for make_unique
#include <mutex>      // for call_once
#include <sstream>    // for stringstream
#include <string>     // for string

#include "common.h"               // for safe_cuda
#include "cuda_rt_utils.h"        // for CurrentDevice
#include "xgboost/string_view.h"  // for StringView

namespace xgboost::cudr {
CuDriverApi::CuDriverApi() {
  // similar to dlopen, but without the need to release a handle.
  auto safe_load = [](xgboost::StringView name, auto **fnptr) {
    cudaDriverEntryPointQueryResult status;
    dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast<void **>(fnptr),
                                          cudaEnablePerThreadDefaultStream, &status));
    CHECK(status == cudaDriverEntryPointSuccess) << name;
    CHECK(*fnptr);
  };

  safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity);
  safe_load("cuMemCreate", &this->cuMemCreate);
  safe_load("cuMemMap", &this->cuMemMap);
  safe_load("cuMemAddressReserve", &this->cuMemAddressReserve);
  safe_load("cuMemSetAccess", &this->cuMemSetAccess);
  safe_load("cuMemUnmap", &this->cuMemUnmap);
  safe_load("cuMemRelease", &this->cuMemRelease);
  safe_load("cuMemAddressFree", &this->cuMemAddressFree);
  safe_load("cuGetErrorString", &this->cuGetErrorString);
  safe_load("cuGetErrorName", &this->cuGetErrorName);
  safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
  safe_load("cuDeviceGet", &this->cuDeviceGet);

  CHECK(this->cuMemGetAllocationGranularity);
}

void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line,
                               char const *file) const {
  if (status == CUDA_SUCCESS) {
    return;
  }
  std::string cuerr{"CUDA driver error:"};

  char const *name{nullptr};
  auto err0 = this->cuGetErrorName(status, &name);
  if (err0 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error name:" << err0;
  }
  char const *msg{nullptr};
  auto err1 = this->cuGetErrorString(status, &msg);
  if (err1 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error string:" << err1;
  }

  std::stringstream ss;
  ss << fn << "[" << file << ":" << line << "]:";
  if (name != nullptr && err0 == CUDA_SUCCESS) {
    ss << cuerr << " " << name << ".";
  }
  if (msg != nullptr && err1 == CUDA_SUCCESS) {
    ss << " " << msg << "\n";
  }
  LOG(FATAL) << ss.str();
}

[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() {
  static std::once_flag flag;
  static std::unique_ptr<CuDriverApi> cu;
  std::call_once(flag, [&] { cu = std::make_unique<CuDriverApi>(); });
  return *cu;
}

void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
  auto ordinal = curt::CurrentDevice();
  loc->type = type;

  if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
    loc->id = ordinal;
  } else {
    std::int32_t numa_id = -1;
    CUdevice device;
    safe_cu(GetGlobalCuDriverApi().cuDeviceGet(&device, ordinal));
    safe_cu(GetGlobalCuDriverApi().cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID,
                                                        device));
    numa_id = std::max(numa_id, 0);

    loc->id = numa_id;
  }
}

[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
  CUmemAllocationProp prop;
  std::memset(&prop, '\0', sizeof(prop));
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  MakeCuMemLocation(type, &prop.location);
  return prop;
}
}  // namespace xgboost::cudr
#endif
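
For reference, a self-contained sketch of the entry-point loading used in the constructor above: it resolves one driver symbol through the runtime API so that libcuda.so never becomes a link-time dependency. This is illustrative only, assumes CUDA 12 or newer for the four-argument cudaGetDriverEntryPoint overload, and is not part of the commit:

  #include <cuda.h>
  #include <cuda_runtime_api.h>
  #include <cstdio>

  // Resolve cuGetErrorString at runtime, the same way CuDriverApi does for every symbol it needs.
  using GetErrorStringFn = CUresult(CUresult, const char **);

  int main() {
    GetErrorStringFn *get_error_string = nullptr;
    cudaDriverEntryPointQueryResult status;
    auto rc = cudaGetDriverEntryPoint("cuGetErrorString",
                                      reinterpret_cast<void **>(&get_error_string),
                                      cudaEnablePerThreadDefaultStream, &status);
    if (rc != cudaSuccess || status != cudaDriverEntryPointSuccess || get_error_string == nullptr) {
      std::printf("driver entry point not available\n");
      return 1;
    }
    const char *msg = nullptr;
    get_error_string(CUDA_ERROR_INVALID_VALUE, &msg);  // exercise the loaded symbol
    std::printf("CUDA_ERROR_INVALID_VALUE -> %s\n", msg ? msg : "(null)");
    return 0;
  }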
src/common/cuda_dr_utils.h (new file, 105 lines added)
@@ -0,0 +1,105 @@
/**
 * Copyright 2024, XGBoost contributors
 *
 * @brief Utility for CUDA driver API.
 *
 * XGBoost doesn't link libcuda.so at build time. The utilities here load the shared
 * object at runtime.
 */
#pragma once

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cstdint>  // for int32_t

#include "xgboost/string_view.h"  // for StringView

namespace xgboost::cudr {
/**
 * @brief A struct for retrieving CUDA driver API from the runtime API.
 */
struct CuDriverApi {
  using Flags = unsigned long long;  // NOLINT

  // Memory manipulation functions.
  using MemGetAllocationGranularityFn = CUresult(size_t *granularity,
                                                 const CUmemAllocationProp *prop,
                                                 CUmemAllocationGranularity_flags option);
  using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size,
                               const CUmemAllocationProp *prop, Flags flags);
  using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset,
                            CUmemGenericAllocationHandle handle, Flags flags);
  using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment,
                                       CUdeviceptr addr, Flags flags);
  using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc,
                                  size_t count);
  using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size);
  using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle);
  using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size);
  // Error handling
  using GetErrorString = CUresult(CUresult error, const char **pStr);
  using GetErrorName = CUresult(CUresult error, const char **pStr);
  // Device attributes
  using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
  using DeviceGet = CUresult(CUdevice *device, int ordinal);

  MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr};  // NOLINT
  MemCreateFn *cuMemCreate{nullptr};                                      // NOLINT
  /**
   * @param[in] offset - Must be zero.
   */
  MemMapFn *cuMemMap{nullptr};  // NOLINT
  /**
   * @param[out] ptr       - Resulting pointer to start of virtual address range allocated
   * @param[in]  size      - Size of the reserved virtual address range requested
   * @param[in]  alignment - Alignment of the reserved virtual address range requested
   * @param[in]  addr      - Fixed starting address range requested
   * @param[in]  flags     - Currently unused, must be zero
   */
  MemAddressReserveFn *cuMemAddressReserve{nullptr};  // NOLINT
  MemSetAccessFn *cuMemSetAccess{nullptr};            // NOLINT
  MemUnmapFn *cuMemUnmap{nullptr};                    // NOLINT
  MemReleaseFn *cuMemRelease{nullptr};                // NOLINT
  MemAddressFreeFn *cuMemAddressFree{nullptr};        // NOLINT
  GetErrorString *cuGetErrorString{nullptr};          // NOLINT
  GetErrorName *cuGetErrorName{nullptr};              // NOLINT
  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};  // NOLINT
  DeviceGet *cuDeviceGet{nullptr};                    // NOLINT

  CuDriverApi();

  void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const;
};

[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi();

/**
 * @brief Macro for guarding CUDA driver API calls.
 */
#define safe_cu(call)                                                                        \
  do {                                                                                       \
    auto __status = (call);                                                                  \
    if (__status != CUDA_SUCCESS) {                                                          \
      ::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(__status, #call, __LINE__,        \
                                                           __FILE__);                        \
    }                                                                                        \
  } while (0)

// Get the allocation granularity.
inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
  std::size_t granularity;
  safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
      &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return granularity;
}

/**
 * @brief Obtain appropriate device ordinal for `CUmemLocation`.
 */
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);

/**
 * @brief Construct a `CUmemAllocationProp`.
 */
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
}  // namespace xgboost::cudr
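
A hedged example of the wrapper in use: query the recommended granularity for pinned device allocations through MakeAllocProp, GetAllocGranularity, and the safe_cu guard defined above. The function name is made up for illustration; host-NUMA locations work the same way on drivers new enough to report CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID:

  #include <cstddef>   // for size_t
  #include <iostream>  // for cout

  #include "cuda_dr_utils.h"  // MakeAllocProp, GetAllocGranularity, GetGlobalCuDriverApi, safe_cu

  // Illustrative helper, not part of the commit.
  inline void ReportGranularity() {
    auto prop = xgboost::cudr::MakeAllocProp(CU_MEM_LOCATION_TYPE_DEVICE);
    std::size_t recommended = xgboost::cudr::GetAllocGranularity(&prop);
    // The same query written against the raw wrapper, guarded by safe_cu:
    std::size_t minimum = 0;
    safe_cu(xgboost::cudr::GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
        &minimum, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    std::cout << "granularity: recommended=" << recommended << " minimum=" << minimum << "\n";
  }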
@@ -8,10 +8,11 @@
 #endif  // defined(XGBOOST_USE_CUDA)

 #include <cstdint>  // for int32_t
+#include <mutex>    // for once_flag, call_once

 #include "common.h"  // for safe_cuda

-namespace xgboost::common {
+namespace xgboost::curt {
 #if defined(XGBOOST_USE_CUDA)
 std::int32_t AllVisibleGPUs() {
   int n_visgpus = 0;
@@ -19,7 +20,7 @@ std::int32_t AllVisibleGPUs() {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch (const dmlc::Error &) {
+  } catch (const dmlc::Error&) {
     cudaGetLastError();  // reset error.
     return 0;
   }
@@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) {
     dh::safe_cuda(cudaSetDevice(device));
   }
 }
+
+namespace {
+template <typename Fn>
+void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
+  static std::int32_t version = 0;
+  static std::once_flag flag;
+  std::call_once(flag, [&] { fn(&version); });
+  if (major) {
+    *major = version / 1000;
+  }
+  if (minor) {
+    *minor = version % 100 / 10;
+  }
+}
+}  // namespace
+
+void RtVersion(std::int32_t* major, std::int32_t* minor) {
+  GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaRuntimeGetVersion(ver)); }, major,
+                 minor);
+}
+
+void DrVersion(std::int32_t* major, std::int32_t* minor) {
+  GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaDriverGetVersion(ver)); }, major, minor);
+}
+
 #else
 std::int32_t AllVisibleGPUs() { return 0; }

 std::int32_t CurrentDevice() {
-  AssertGPUSupport();
+  common::AssertGPUSupport();
   return -1;
 }

@@ -79,8 +105,8 @@ void CheckComputeCapability() {}

 void SetDevice(std::int32_t device) {
   if (device >= 0) {
-    AssertGPUSupport();
+    common::AssertGPUSupport();
   }
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace xgboost::common
+}  // namespace xgboost::curt
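
The version helpers above decode the integer reported by the CUDA runtime (for example 12050 for CUDA 12.5) into a major/minor pair via version / 1000 and version % 100 / 10. A minimal sketch of calling them, assuming the declarations from cuda_rt_utils.h are visible; the function name is illustrative only:

  #include <cstdint>
  #include <iostream>

  #include "cuda_rt_utils.h"  // for RtVersion, DrVersion

  void ReportCudaVersions() {
    std::int32_t rt_major = 0, rt_minor = 0, dr_major = 0, dr_minor = 0;
    xgboost::curt::RtVersion(&rt_major, &rt_minor);  // e.g. 12050 -> 12, 5
    xgboost::curt::DrVersion(&dr_major, &dr_minor);
    std::cout << "runtime: " << rt_major << "." << rt_minor
              << ", driver supports up to: " << dr_major << "." << dr_minor << "\n";
  }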
@@ -8,7 +8,7 @@
 #include <nvtx3/nvtx3.hpp>
 #endif  // defined(XGBOOST_USE_NVTX)

-namespace xgboost::common {
+namespace xgboost::curt {
 std::int32_t AllVisibleGPUs();

 std::int32_t CurrentDevice();
@@ -24,6 +24,12 @@ void CheckComputeCapability();

 void SetDevice(std::int32_t device);

+// Returns the CUDA Runtime version.
+void RtVersion(std::int32_t* major, std::int32_t* minor);
+
+// Returns the latest version of CUDA supported by the driver.
+void DrVersion(std::int32_t* major, std::int32_t* minor);
+
 struct NvtxDomain {
   static constexpr char const *name{"libxgboost"};  // NOLINT
 };
@@ -49,10 +55,10 @@ class NvtxRgb {
   explicit NvtxRgb(Args &&...) {}
 };
 #endif  // defined(XGBOOST_USE_NVTX)
-}  // namespace xgboost::common
+}  // namespace xgboost::curt

 #if defined(XGBOOST_USE_NVTX)
-#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
+#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain)
 #else
 #define xgboost_NVTX_FN_RANGE()
 #endif  // defined(XGBOOST_USE_NVTX)
src/common/device_helpers.cu (new file, 23 lines added)
@@ -0,0 +1,23 @@
/**
 * Copyright 2024, XGBoost contributors
 */
#include "cuda_rt_utils.h"    // for RtVersion
#include "device_helpers.cuh"
#include "xgboost/windefs.h"  // for xgboost_IS_WIN

namespace dh {
PinnedMemory::PinnedMemory() {
#if defined(xgboost_IS_WIN)
  this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
#else
  std::int32_t major{0}, minor{0};
  xgboost::curt::DrVersion(&major, &minor);
  // Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable.
  if (major >= 12 && minor >= 5) {
    this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
  } else {
    this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
  }
#endif
}
}  // namespace dh
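
The constructor above picks the backend once per object: the virtual-memory allocator on Linux with a sufficiently new driver, and the cudaMallocHost-based fallback otherwise (always on Windows). A hedged sanity check mirroring the unit test added later in this diff; the function name is illustrative and assumes a Linux build:

  #include <cstdint>

  #include "cuda_rt_utils.h"     // for DrVersion
  #include "device_helpers.cuh"  // for dh::PinnedMemory

  // Illustrative check, not part of the commit: confirm which backend was picked on Linux.
  inline bool BackendMatchesDriverGate() {
    std::int32_t major = 0, minor = 0;
    xgboost::curt::DrVersion(&major, &minor);
    bool expect_vm = (major >= 12 && minor >= 5);  // same gate as the constructor
    dh::PinnedMemory pinned;
    return pinned.IsVm() == expect_vm;
  }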
@@ -16,7 +16,8 @@
 #include <cstddef>  // for size_t
 #include <cub/cub.cuh>
 #include <cub/util_type.cuh>  // for UnitWord, DoubleBuffer
-#include <vector>
+#include <variant>  // for variant, visit
+#include <vector>   // for vector

 #include "common.h"
 #include "device_vector.cuh"
@@ -372,36 +373,25 @@ void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<const T>
 }

 // Keep track of pinned memory allocation
-struct PinnedMemory {
-  void *temp_storage{nullptr};
-  size_t temp_storage_bytes{0};
+class PinnedMemory {
+  std::variant<detail::GrowOnlyPinnedMemoryImpl, detail::GrowOnlyVirtualMemVec> impl_;

-  ~PinnedMemory() { Free(); }
+ public:
+  PinnedMemory();

   template <typename T>
   xgboost::common::Span<T> GetSpan(size_t size) {
-    size_t num_bytes = size * sizeof(T);
-    if (num_bytes > temp_storage_bytes) {
-      Free();
-      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
-      temp_storage_bytes = num_bytes;
-    }
-    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
+    return std::visit([&](auto &&alloc) { return alloc.template GetSpan<T>(size); }, this->impl_);
   }

   template <typename T>
-  xgboost::common::Span<T> GetSpan(size_t size, T init) {
+  xgboost::common::Span<T> GetSpan(size_t size, T const &init) {
     auto result = this->GetSpan<T>(size);
-    for (auto &e : result) {
-      e = init;
-    }
+    std::fill_n(result.data(), result.size(), init);
     return result;
   }

-  void Free() {
-    if (temp_storage != nullptr) {
-      safe_cuda(cudaFreeHost(temp_storage));
-    }
+  // Used for testing.
+  [[nodiscard]] bool IsVm() {
+    return std::get_if<detail::GrowOnlyVirtualMemVec>(&this->impl_) != nullptr;
   }
 };
@@ -1,10 +1,14 @@
 /**
  * Copyright 2017-2024, XGBoost contributors
  */
+#include <numeric>  // for accumulate
+
 #include "../collective/communicator-inl.h"  // for GetRank
 #include "common.h"                          // for HumanMemUnit
+#include "cuda_dr_utils.h"
 #include "device_helpers.cuh"  // for CurrentDevice
 #include "device_vector.cuh"
+#include "transform_iterator.h"  // for MakeIndexTransformIter

 namespace dh {
 namespace detail {
@@ -18,6 +22,79 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
      << "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
   LOG(FATAL) << ss.str();
 }
+
+[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::PhyCapacity() const {
+  auto it = xgboost::common::MakeIndexTransformIter(
+      [&](std::size_t i) { return this->handles_[i]->size; });
+  return std::accumulate(it, it + this->handles_.size(), static_cast<std::size_t>(0));
+}
+
+void GrowOnlyVirtualMemVec::Reserve(std::size_t new_size) {
+  auto va_capacity = this->Capacity();
+  if (new_size < va_capacity) {
+    return;
+  }
+
+  // Try to reserve new virtual address.
+  auto const aligned_size = RoundUp(new_size, this->granularity_);
+  auto const new_reserve_size = aligned_size - va_capacity;
+  CUresult status = CUDA_SUCCESS;
+  auto hint = this->DevPtr() + va_capacity;
+
+  bool failed{false};
+  auto range = std::make_unique<VaRange>(new_reserve_size, hint, &status, &failed);
+  if (failed) {
+    // Failed to reserve the requested address.
+    // Slow path, try to reserve a new address with full size.
+    range = std::make_unique<VaRange>(aligned_size, 0ULL, &status, &failed);
+    safe_cu(status);
+    CHECK(!failed);
+
+    // New allocation is successful. Map the physical address to the virtual address.
+    // First unmap the existing ptr.
+    if (this->DevPtr() != 0) {
+      // Unmap the existing ptr.
+      safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
+
+      // Then remap all the existing physical addresses to the new ptr.
+      CUdeviceptr ptr = range->DevPtr();
+      for (auto const &hdl : this->handles_) {
+        this->MapBlock(ptr, hdl);
+        ptr += hdl->size;
+      }
+
+      // Release the existing ptr.
+      va_ranges_.clear();
+    }
+  }
+
+  va_ranges_.emplace_back(std::move(range));
+}
+
+GrowOnlyVirtualMemVec::GrowOnlyVirtualMemVec(CUmemLocationType type)
+    : prop_{xgboost::cudr::MakeAllocProp(type)},
+      granularity_{xgboost::cudr::GetAllocGranularity(&this->prop_)} {
+  CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA);
+  // Assign the access descriptor
+  CUmemAccessDesc dacc;
+  dacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &dacc.location);
+  this->access_desc_.push_back(dacc);
+
+  if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
+    CUmemAccessDesc hacc;
+    hacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    xgboost::cudr::MakeCuMemLocation(type, &hacc.location);
+    this->access_desc_.push_back(hacc);
+  }
+}
+
+[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::Capacity() const {
+  auto it = xgboost::common::MakeIndexTransformIter(
+      [&](std::size_t i) { return this->va_ranges_[i]->Size(); });
+  return std::accumulate(it, it + this->va_ranges_.size(), static_cast<std::size_t>(0));
+}
 }  // namespace detail

 #if defined(XGBOOST_USE_RMM)
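
Growth above is always in whole granules: GrowTo (defined in the header below) pads the requested delta up to the allocation granularity before reserving and mapping a new physical block. A worked example of that rounding, with a hypothetical 2 MiB granularity (the real value comes from cuMemGetAllocationGranularity); this is a standalone sketch, not part of the commit:

  #include <cassert>
  #include <cstddef>

  // Same rounding rule as GrowOnlyVirtualMemVec::RoundUp.
  constexpr std::size_t RoundUpTo(std::size_t n, std::size_t chunk) {
    return ((n + chunk - 1) / chunk) * chunk;
  }

  int main() {
    constexpr std::size_t kGran = 2u << 20;               // assumed 2 MiB granularity
    static_assert(RoundUpTo(1, kGran) == kGran);          // 1 byte still maps one full granule
    static_assert(RoundUpTo(kGran, kGran) == kGran);      // exact multiples are unchanged
    static_assert(RoundUpTo(kGran + 1, kGran) == 2 * kGran);
    // Growing from 2 MiB of mapped blocks to a 3 MiB request: delta = 1 MiB, padded to 2 MiB,
    // so exactly one new physical block is created and mapped at the end of the range.
    assert(RoundUpTo((3u << 20) - (2u << 20), kGran) == kGran);
    return 0;
  }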
@@ -25,6 +25,8 @@

 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1

+#include <cuda.h>
+
 #include <cstddef>  // for size_t
 #include <cub/util_allocator.cuh>  // for CachingDeviceAllocator
 #include <cub/util_device.cuh>     // for CurrentDevice
@@ -33,7 +35,9 @@
 #include <mutex>  // for defer_lock

 #include "common.h"  // for safe_cuda, HumanMemUnit
+#include "cuda_dr_utils.h"  // for CuDriverApi
 #include "xgboost/logging.h"
+#include "xgboost/span.h"  // for Span

 namespace dh {
 namespace detail {
@@ -127,6 +131,153 @@ class MemoryLogger {
 };

 void ThrowOOMError(std::string const &err, std::size_t bytes);
+
+struct GrowOnlyPinnedMemoryImpl {
+  void *temp_storage{nullptr};
+  size_t temp_storage_bytes{0};
+
+  ~GrowOnlyPinnedMemoryImpl() { Free(); }
+
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(size_t size) {
+    size_t num_bytes = size * sizeof(T);
+    if (num_bytes > temp_storage_bytes) {
+      Free();
+      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
+      temp_storage_bytes = num_bytes;
+    }
+    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
+  }
+
+  void Free() {
+    if (temp_storage != nullptr) {
+      safe_cuda(cudaFreeHost(temp_storage));
+    }
+  }
+};
+
+/**
+ * @brief Use low-level virtual memory functions from CUDA driver API for grow-only memory
+ *        allocation.
+ *
+ * @url https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/
+ *
+ * Aside from the potential performance benefits, this is primarily implemented to prevent
+ * deadlock in NCCL and XGBoost. The host NUMA version requires CTK 12.5+ to be stable.
+ */
+class GrowOnlyVirtualMemVec {
+  static auto RoundUp(std::size_t new_sz, std::size_t chunk_sz) {
+    return ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;
+  }
+
+  struct PhyAddrHandle {
+    CUmemGenericAllocationHandle handle;
+    std::size_t size;
+  };
+
+  class VaRange {
+    CUdeviceptr ptr_{0};
+    std::size_t size_{0};
+
+   public:
+    VaRange(std::size_t size, CUdeviceptr hint, CUresult *p_status, bool *failed) : size_{size} {
+      CUresult &status = *p_status;
+      status = xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressReserve(&ptr_, size, 0, hint, 0);
+      *failed = status != CUDA_SUCCESS || (hint != 0 && ptr_ != hint);
+    }
+    ~VaRange() {
+      if (ptr_ != 0) {
+        xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressFree(ptr_, this->size_);
+      }
+    }
+
+    VaRange(VaRange const &that) = delete;
+    VaRange &operator=(VaRange const &that) = delete;
+
+    VaRange(VaRange &&that) { std::swap(*this, that); }
+    VaRange &operator=(VaRange &&that) {
+      std::swap(*this, that);
+      return *this;
+    }
+    [[nodiscard]] auto DevPtr() const { return this->ptr_; }
+    [[nodiscard]] std::size_t Size() const { return this->size_; }
+  };
+
+  using PhyHandle = std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>;
+  std::vector<PhyHandle> handles_;
+  std::vector<std::unique_ptr<VaRange>> va_ranges_;
+
+  xgboost::cudr::CuDriverApi &cu_{xgboost::cudr::GetGlobalCuDriverApi()};
+  std::vector<CUmemAccessDesc> access_desc_;
+  CUmemAllocationProp const prop_;
+
+  // Always use bytes.
+  std::size_t const granularity_;
+
+  [[nodiscard]] std::size_t PhyCapacity() const;
+  [[nodiscard]] CUdeviceptr DevPtr() const {
+    if (this->va_ranges_.empty()) {
+      return 0;
+    }
+    return this->va_ranges_.front()->DevPtr();
+  }
+  void MapBlock(CUdeviceptr ptr, PhyHandle const &hdl) const {
+    safe_cu(cu_.cuMemMap(ptr, hdl->size, 0, hdl->handle, 0));
+    safe_cu(cu_.cuMemSetAccess(ptr, hdl->size, access_desc_.data(), access_desc_.size()));
+  }
+  auto CreatePhysicalMem(std::size_t size) const {
+    CUmemGenericAllocationHandle alloc_handle;
+    auto padded_size = RoundUp(size, this->granularity_);
+    CUresult status = this->cu_.cuMemCreate(&alloc_handle, padded_size, &this->prop_, 0);
+    CHECK_EQ(status, CUDA_SUCCESS);
+    return alloc_handle;
+  }
+  void Reserve(std::size_t new_size);
+
+ public:
+  explicit GrowOnlyVirtualMemVec(CUmemLocationType type);
+
+  void GrowTo(std::size_t n_bytes) {
+    auto alloc_size = this->PhyCapacity();
+    if (n_bytes <= alloc_size) {
+      return;
+    }
+
+    std::size_t delta = n_bytes - alloc_size;
+    auto const padded_delta = RoundUp(delta, this->granularity_);
+    this->Reserve(alloc_size + padded_delta);
+
+    this->handles_.emplace_back(
+        std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>{
+            new PhyAddrHandle{this->CreatePhysicalMem(padded_delta), padded_delta}, [&](auto *hdl) {
+              if (hdl) {
+                cu_.cuMemRelease(hdl->handle);
+              }
+            }});
+    auto ptr = this->DevPtr() + alloc_size;
+    this->MapBlock(ptr, this->handles_.back());
+  }
+
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(std::size_t size) {
+    size_t n_bytes = size * sizeof(T);
+    this->GrowTo(n_bytes);
+    return xgboost::common::Span<T>(reinterpret_cast<T *>(this->DevPtr()), size);
+  }
+
+  ~GrowOnlyVirtualMemVec() noexcept(false) {
+    if (this->DevPtr() != 0) {
+      safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
+    }
+
+    this->va_ranges_.clear();  // make sure all VA are freed before releasing the handles.
+    this->handles_.clear();    // release the handles
+  }
+
+  [[nodiscard]] void *data() { return reinterpret_cast<void *>(this->DevPtr()); }  // NOLINT
+  [[nodiscard]] std::size_t size() const { return this->PhyCapacity(); }           // NOLINT
+  [[nodiscard]] std::size_t Capacity() const;
+};
 }  // namespace detail

 inline detail::MemoryLogger &GlobalMemoryLogger() {
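
For orientation, the driver-level sequence that GrowOnlyVirtualMemVec wraps, written against the CuDriverApi handle from cuda_dr_utils.h. This is a condensed sketch of the pattern from the NVIDIA blog post linked in the class comment (a single device-local block, no growth or error recovery); the function name is illustrative and it is not part of the commit:

  #include <cstddef>

  #include "cuda_dr_utils.h"  // CuDriverApi, MakeAllocProp, MakeCuMemLocation, GetAllocGranularity, safe_cu

  // Illustrative sketch: allocate one physical block, map it into a reserved virtual
  // range, grant access, then tear everything down in reverse order.
  inline void VmmRoundTrip() {
    auto &cu = xgboost::cudr::GetGlobalCuDriverApi();
    auto prop = xgboost::cudr::MakeAllocProp(CU_MEM_LOCATION_TYPE_DEVICE);
    std::size_t size = xgboost::cudr::GetAllocGranularity(&prop);  // one granule

    CUdeviceptr ptr = 0;
    safe_cu(cu.cuMemAddressReserve(&ptr, size, 0, 0, 0));   // 1. reserve virtual addresses

    CUmemGenericAllocationHandle handle;
    safe_cu(cu.cuMemCreate(&handle, size, &prop, 0));        // 2. create physical memory
    safe_cu(cu.cuMemMap(ptr, size, 0, handle, 0));           // 3. back the range with it

    CUmemAccessDesc acc;
    acc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &acc.location);
    safe_cu(cu.cuMemSetAccess(ptr, size, &acc, 1));          // 4. grant read/write access

    // ... ptr is now usable as `size` bytes of device memory ...

    safe_cu(cu.cuMemUnmap(ptr, size));                       // 5. unmap before releasing
    safe_cu(cu.cuMemRelease(handle));                        // 6. release the physical memory
    safe_cu(cu.cuMemAddressFree(ptr, size));                 // 7. free the virtual range
  }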
@@ -337,7 +337,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
                                   int num_cuts_per_feature, bool is_ranking, float missing,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer* sketch_container) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   info.weights_.SetDevice(ctx->Device());
   auto weights = info.weights_.ConstDeviceSpan();
@@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span<SketchEntry const> const &d_x,

 void SketchContainer::Push(Context const *ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
                            common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights) {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   Span<SketchEntry> out;
   dh::device_vector<SketchEntry> cuts;
   bool first_window = this->Current().empty();
@@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);

   auto key_it = dh::MakeTransformIterator<size_t>(
@@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,

 void SketchContainer::Prune(Context const* ctx, std::size_t to) {
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -443,7 +443,7 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) {

 void SketchContainer::Merge(Context const *ctx, Span<OffsetT const> d_that_columns_ptr,
                             Span<SketchEntry const> that) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   auto self = dh::ToSpan(this->Current());
   LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". "
              << "That:" << HumanMemUnit(that.size_bytes()) << ". "
@@ -507,7 +507,7 @@ void SketchContainer::FixError() {
 }

 void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
@@ -596,7 +596,7 @@ struct InvalidCatOp {

 void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   p_cuts->min_vals_.Resize(num_columns_);

   // Sync between workers.
@@ -206,7 +206,7 @@
   template <typename KeyComp = thrust::equal_to<size_t>>
   size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to<size_t>{}) {
     timer_.Start(__func__);
-    SetDevice(ctx->Ordinal());
+    curt::SetDevice(ctx->Ordinal());
     this->columns_ptr_.SetDevice(ctx->Device());
     Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
     CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
@@ -13,6 +13,8 @@

 #if defined(__linux__)
 #include <pthread.h>
+#include <sys/syscall.h>  // for SYS_getcpu
+#include <unistd.h>       // for syscall
 #endif

 namespace xgboost::common {
@@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
   return n_threads;
 }

+[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) {
+#ifdef SYS_getcpu
+  return syscall(SYS_getcpu, cpu, numa, NULL) == 0;
+#else
+  return false;
+#endif
+}
+
 void NameThread(std::thread* t, StringView name) {
 #if defined(__linux__)
   auto handle = t->native_handle();
@@ -306,10 +306,16 @@ class MemStackAllocator {
 };

 /**
- * \brief Constant that can be used for initializing static thread local memory.
+ * @brief Constant that can be used for initializing static thread local memory.
  */
 std::int32_t constexpr DefaultMaxThreads() { return 128; }

+/**
+ * @brief Get numa node on Linux. Other platforms are not supported. Returns false if the
+ *        call fails.
+ */
+[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa);
+
 /**
  * @brief Give the thread a name. Supports only pthread on linux.
  */
@@ -18,7 +18,7 @@ void Monitor::Start(std::string const &name) {
   auto &stats = statistics_map_[name];
   stats.timer.Start();
 #if defined(XGBOOST_USE_NVTX)
-  auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
+  auto range_handle = nvtx3::start_range_in<curt::NvtxDomain>(label_ + "::" + name);
   stats.nvtx_id = range_handle.get_value();
 #endif  // defined(XGBOOST_USE_NVTX)
 }
@@ -30,7 +30,7 @@ void Monitor::Stop(const std::string &name) {
   stats.timer.Stop();
   stats.count++;
 #if defined(XGBOOST_USE_NVTX)
-  nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
+  nvtx3::end_range_in<curt::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
 #endif  // defined(XGBOOST_USE_NVTX)
 }
 }
@@ -38,7 +38,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
 [[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
   // When booster is loaded from a memory image (Python pickle or R raw model), number of
   // available GPUs could be different. Wrap around it.
-  std::int32_t n_visible = common::AllVisibleGPUs();
+  std::int32_t n_visible = curt::AllVisibleGPUs();
   if (n_visible == 0) {
     if (device.IsCUDA()) {
       LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
@@ -55,7 +55,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
   }

   if (device.IsCUDA()) {
-    common::SetDevice(device.ordinal);
+    curt::SetDevice(device.ordinal);
   }
   return device;
 }
@@ -139,7 +139,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
       n_rows{n_rows},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   this->InitCompressedData(ctx);
 }
@@ -154,7 +154,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
       row_stride{row_stride},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   this->InitCompressedData(ctx);
   this->CreateHistIndices(ctx, page, feature_types);
@@ -173,7 +173,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc
           common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   this->InitCompressedData(ctx);
@@ -319,7 +319,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m
                                  bst_idx_t n_rows,
                                  std::shared_ptr<common::HistogramCuts const> cuts)
     : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   if (this->IsDense()) {
     CopyDataToEllpack<true>(ctx, batch, feature_types, this, missing);
@@ -85,7 +85,7 @@ template <typename T>
   bytes += fo->Write(impl->is_dense);
   bytes += fo->Write(impl->row_stride);
   std::vector<common::CompressedByteT> h_gidx_buffer;
-  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  Context ctx = Context{}.MakeCUDA(curt::CurrentDevice());
   [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
   bytes += common::WriteVec(fo, h_gidx_buffer);
   bytes += fo->Write(impl->base_rowid);
@@ -202,7 +202,7 @@ EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringVi
  */
 template <typename F>
 void EllpackPageSourceImpl<F>::Fetch() {
-  common::SetDevice(this->Device().ordinal);
+  curt::SetDevice(this->Device().ordinal);
   if (!this->ReadCache()) {
     if (this->count_ != 0 && !this->sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
@@ -236,7 +236,7 @@ EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>
  */
 template <typename F>
 void ExtEllpackPageSourceImpl<F>::Fetch() {
-  common::SetDevice(this->Device().ordinal);
+  curt::SetDevice(this->Device().ordinal);
   if (!this->ReadCache()) {
     auto iter = this->source_->Iter();
     CHECK_EQ(this->count_, iter);
@@ -61,7 +61,7 @@ template <typename S>
 class EllpackFormatPolicy {
   std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
   DeviceOrd device_;
-  bool has_hmm_{common::SupportsPageableMem()};
+  bool has_hmm_{curt::SupportsPageableMem()};

  public:
   using FormatT = EllpackPageRawFormat;
@@ -71,7 +71,7 @@ class EllpackFormatPolicy {
     StringView msg{" The overhead of iterating through external memory might be significant."};
     if (!has_hmm_) {
       LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
-    } else if (!common::SupportsAts()) {
+    } else if (!curt::SupportsAts()) {
       LOG(WARNING) << "CUDA address translation service is not available." << msg;
     }
 #if !defined(XGBOOST_USE_RMM)
@@ -121,7 +121,7 @@ class EllpackCacheStreamPolicy : public F<S> {

 template <typename S, template <typename> typename F>
 class EllpackMmapStreamPolicy : public F<S> {
-  bool has_hmm_{common::SupportsPageableMem()};
+  bool has_hmm_{curt::SupportsPageableMem()};

  public:
   using WriterT = common::AlignedFileWriteStream;
@@ -64,8 +64,8 @@ void MakeSketches(Context const* ctx,
    * Get the data shape.
    */
   // We use do while here as the first batch is fetched in ctor
-  CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
-  common::SetDevice(dh::GetDevice(ctx).ordinal);
+  CHECK_LT(ctx->Ordinal(), curt::AllVisibleGPUs());
+  curt::SetDevice(dh::GetDevice(ctx).ordinal);
   if (ext_info.n_features == 0) {
     ext_info.n_features = data::BatchColumns(proxy);
     auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1),
@@ -124,7 +124,7 @@ void MakeSketches(Context const* ctx,
               ext_info.base_rows.begin());

   // Get reference
-  common::SetDevice(dh::GetDevice(ctx).ordinal);
+  curt::SetDevice(dh::GetDevice(ctx).ordinal);
   if (!ref) {
     HostDeviceVector<FeatureType> ft;
     common::SketchContainer final_sketch(
@@ -37,7 +37,7 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
   size_t max_row_perbatch;

   void CheckGPUSupport() {
-    auto n_gpus = common::AllVisibleGPUs();
+    auto n_gpus = curt::AllVisibleGPUs();
     if (n_gpus == 0 && this->updater == "gpu_coord_descent") {
       common::AssertGPUSupport();
       this->UpdateAllowUnknown(Args{{"updater", "coord_descent"}});
@@ -105,7 +105,7 @@ void GBTree::Configure(Args const& cfg) {
   }
   cpu_predictor_->Configure(cfg);
 #if defined(XGBOOST_USE_CUDA)
-  auto n_gpus = common::AllVisibleGPUs();
+  auto n_gpus = curt::AllVisibleGPUs();
   if (!gpu_predictor_) {
     gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
   }
@@ -344,7 +344,7 @@ void GBTree::LoadConfig(Json const& in) {
   // This would cause all trees to be pushed to trees_to_update
   // e.g. updating a model, then saving and loading it would result in an empty model
   tparam_.process_type = TreeProcessType::kDefault;
-  std::int32_t const n_gpus = common::AllVisibleGPUs();
+  std::int32_t const n_gpus = curt::AllVisibleGPUs();

   auto msg = StringView{
       R"(
@@ -482,7 +482,7 @@ void ExtractPaths(Context const* ctx,
                   dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
                   DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
                   DeviceOrd device) {
-  common::SetDevice(device.ordinal);
+  curt::SetDevice(device.ordinal);
   auto& device_model = *model;

   dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -937,7 +937,7 @@ class GPUPredictor : public xgboost::Predictor {
       : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}

   ~GPUPredictor() override {
-    if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
+    if (ctx_->IsCUDA() && ctx_->Ordinal() < curt::AllVisibleGPUs()) {
       dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     }
   }
@@ -184,7 +184,7 @@ struct GPUHistMakerDevice {
   // Reset values for each update iteration
   [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
     this->monitor.Start(__func__);
-    common::SetDevice(ctx_->Ordinal());
+    curt::SetDevice(ctx_->Ordinal());

     auto const& info = p_fmat->Info();
@@ -789,7 +789,7 @@ class GPUHistMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Hist]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    common::CheckComputeCapability();
+    curt::CheckComputeCapability();
     initialised_ = false;

     monitor_.Init("updater_gpu_hist");
@@ -835,7 +835,7 @@ class GPUHistMaker : public TreeUpdater {
         ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0));
     this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

-    common::SetDevice(ctx_->Ordinal());
+    curt::SetDevice(ctx_->Ordinal());
     p_fmat->Info().feature_types.SetDevice(ctx_->Device());

     std::vector<bst_idx_t> batch_ptr;
@@ -909,7 +909,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Approx]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    common::CheckComputeCapability();
+    curt::CheckComputeCapability();
     initialised_ = false;

     monitor_.Init(this->Name());
|
||||
} // namespace
|
||||
|
||||
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h> // for host_vector
|
||||
|
||||
#include "../../../src/common/common.h" // for AllVisibleGPUs
|
||||
#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs
|
||||
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "test_worker.cuh" // for NCCLWorkerForTest
|
||||
@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
} // namespace
|
||||
|
||||
TEST_F(MGPUAllreduceTest, BitOr) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUAllreduceTest, Sum) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
|
||||
@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {
|
||||
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
TEST_F(CommGroupTest, BasicGPU) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
auto ctx = MakeCUDACtx(r);
|
||||
|
||||
@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
|
||||
template <typename Fn>
|
||||
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
|
||||
[[maybe_unused]] bool emulate_if_single = false) const {
|
||||
auto n_gpus = common::AllVisibleGPUs();
|
||||
auto n_gpus = curt::AllVisibleGPUs();
|
||||
if (is_federated) {
|
||||
#if defined(XGBOOST_USE_FEDERATED)
|
||||
if (n_gpus == 1 && emulate_if_single) {
|
||||
|
||||
@ -3,6 +3,11 @@
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <numeric> // for iota
|
||||
#include <thrust/detail/sequence.inl> // for sequence
|
||||
|
||||
#include "../../../src/common/cuda_rt_utils.h" // for DrVersion
|
||||
#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory
|
||||
#include "../../../src/common/device_vector.cuh"
|
||||
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
|
||||
|
||||
@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
|
||||
ASSERT_EQ(peak, n_bytes);
|
||||
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
namespace {
|
||||
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
|
||||
public:
|
||||
void Run() {
|
||||
auto type = this->GetParam();
|
||||
detail::GrowOnlyVirtualMemVec vec{type};
|
||||
auto prop = xgboost::cudr::MakeAllocProp(type);
|
||||
auto gran = xgboost::cudr::GetAllocGranularity(&prop);
|
||||
ASSERT_GE(gran, 2);
|
||||
auto data = vec.GetSpan<std::int32_t>(32); // should be smaller than granularity
|
||||
ASSERT_EQ(data.size(), 32);
|
||||
static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);
|
||||
|
||||
std::vector<std::int32_t> h_data(data.size());
|
||||
auto check = [&] {
|
||||
for (std::size_t i = 0; i < h_data.size(); ++i) {
|
||||
ASSERT_EQ(h_data[i], i);
|
||||
}
|
||||
};
|
||||
auto fill = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> data) {
|
||||
if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
|
||||
thrust::sequence(dh::CachingThrustPolicy(), data.data() + n_orig, data.data() + data.size(),
|
||||
n_orig);
|
||||
dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault));
|
||||
} else {
|
||||
std::iota(data.data() + n_orig, data.data() + data.size(), n_orig);
|
||||
std::copy_n(data.data(), data.size(), h_data.data());
|
||||
}
|
||||
};
|
||||
|
||||
fill(0, data);
|
||||
check();
|
||||
|
||||
auto n_orig = data.size();
|
||||
// Should be smaller than granularity, use already reserved.
|
||||
data = vec.GetSpan<std::int32_t>(128);
|
||||
h_data.resize(data.size());
|
||||
fill(n_orig, data);
|
||||
check();
|
||||
if (128 < gran) {
|
||||
ASSERT_EQ(vec.Capacity(), gran);
|
||||
}
|
||||
|
||||
n_orig = data.size();
|
||||
data = vec.GetSpan<std::int32_t>(gran / 2);
|
||||
h_data.resize(data.size());
|
||||
fill(n_orig, data);
|
||||
check();
|
||||
ASSERT_EQ(vec.Capacity(), gran * 2);
|
||||
|
||||
n_orig = data.size();
|
||||
data = vec.GetSpan<std::int32_t>(gran);
|
||||
h_data.resize(data.size());
|
||||
fill(n_orig, data);
|
||||
check();
|
||||
ASSERT_EQ(vec.Capacity(), gran * 4);
|
||||
}
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_P(TestVirtualMem, Alloc) { this->Run(); }
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
Basic, TestVirtualMem,
|
||||
::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
|
||||
[](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
|
||||
auto type = info.param;
|
||||
switch (type) {
|
||||
case CU_MEM_LOCATION_TYPE_DEVICE:
|
||||
return "Device";
|
||||
case CU_MEM_LOCATION_TYPE_HOST_NUMA:
|
||||
return "HostNuma";
|
||||
default:
|
||||
LOG(FATAL) << "unreachable";
|
||||
}
|
||||
return nullptr;
|
||||
});
|
||||
#endif // defined(__linux__)
|
||||
|
||||
TEST(TestVirtualMem, Version) {
|
||||
std::int32_t major, minor;
|
||||
xgboost::curt::DrVersion(&major, &minor);
|
||||
LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
|
||||
PinnedMemory pinned;
|
||||
if (major >= 12 && minor >= 5) {
|
||||
ASSERT_TRUE(pinned.IsVm());
|
||||
} else {
|
||||
ASSERT_FALSE(pinned.IsVm());
|
||||
}
|
||||
}
|
||||
} // namespace dh
|
||||
|
||||
@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
|
||||
|
||||
namespace {
|
||||
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
|
||||
common::SetDevice(ctx->Ordinal());
|
||||
curt::SetDevice(ctx->Ordinal());
|
||||
auto n = n_samples * n_features;
|
||||
std::vector<float> x;
|
||||
x.resize(n);
|
||||
|
||||
@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
|
||||
}
|
||||
|
||||
void TestHostDeviceVector(size_t n, DeviceOrd device) {
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
HostDeviceVector<int> v;
|
||||
InitHostDeviceVector(n, device, &v);
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
|
||||
TEST(HostDeviceVector, Copy) {
|
||||
size_t n = 1001;
|
||||
auto device = DeviceOrd::CUDA(0);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
|
||||
HostDeviceVector<int> v;
|
||||
{
|
||||
|
||||
@ -72,7 +72,7 @@ TEST_P(TestEllpackPageRawFormat, DiskIO) {
|
||||
}
|
||||
|
||||
TEST_P(TestEllpackPageRawFormat, DiskIOHmm) {
|
||||
if (common::SupportsPageableMem()) {
|
||||
if (curt::SupportsPageableMem()) {
|
||||
EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
|
||||
this->Run(&policy, this->GetParam());
|
||||
} else {
|
||||
|
||||
@ -655,7 +655,7 @@ class RMMAllocator {
|
||||
std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
|
||||
std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
|
||||
int n_gpu;
|
||||
RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
|
||||
RMMAllocator() : n_gpu(curt::AllVisibleGPUs()) {
|
||||
int current_device;
|
||||
CHECK_EQ(cudaGetDevice(¤t_device), cudaSuccess);
|
||||
for (int i = 0; i < n_gpu; ++i) {
|
||||
@ -697,5 +697,5 @@ void DeleteRMMResource(RMMAllocator*) {}
|
||||
RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
|
||||
#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
|
||||
|
||||
std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
|
||||
std::int32_t DistGpuIdx() { return curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
|
||||
} // namespace xgboost
|
||||
|
||||
@ -34,7 +34,7 @@
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
|
||||
#define GPUIDX (curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
|
||||
#else
|
||||
#define GPUIDX (-1)
|
||||
#endif
|
||||
|
||||
@ -47,7 +47,7 @@ class TestDistributedMetric : public ::testing::TestWithParam<Param> {
|
||||
|
||||
std::int32_t n_workers{0};
|
||||
if (device.IsCUDA()) {
|
||||
n_workers = common::AllVisibleGPUs();
|
||||
n_workers = curt::AllVisibleGPUs();
|
||||
} else {
|
||||
n_workers = std::min(static_cast<std::int32_t>(std::thread::hardware_concurrency()), 3);
|
||||
}
|
||||
|
||||
@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
} // namespace
|
||||
|
||||
TEST_F(FederatedCollTestGPU, Allreduce) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
TestAllreduce(comm, rank, n_workers);
|
||||
});
|
||||
}
|
||||
|
||||
TEST(FederatedCollGPUGlobal, Allreduce) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederatedGlobal(n_workers, [&] {
|
||||
auto r = collective::GetRank();
|
||||
auto world = collective::GetWorldSize();
|
||||
@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
|
||||
}
|
||||
|
||||
TEST_F(FederatedCollTestGPU, Broadcast) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
TestBroadcast(comm, rank);
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(FederatedCollTestGPU, Allgather) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
TestAllgather(comm, rank, n_workers);
|
||||
});
|
||||
@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {
|
||||
|
||||
TEST_F(FederatedCollTestGPU, AllgatherV) {
|
||||
std::int32_t n_workers = 2;
|
||||
if (common::AllVisibleGPUs() < n_workers) {
|
||||
if (curt::AllVisibleGPUs() < n_workers) {
|
||||
GTEST_SKIP_("At least 2 GPUs are required for the test.");
|
||||
}
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
|
||||
namespace xgboost::collective {
|
||||
TEST(CommGroup, Federated) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
|
||||
Context ctx;
|
||||
ASSERT_EQ(comm_group->Rank(), r);
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
|
||||
namespace xgboost::collective {
|
||||
TEST(CommGroup, FederatedGPU) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
|
||||
Context ctx = MakeCUDACtx(0);
|
||||
auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));
|
||||
|
||||
@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
|
||||
TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
|
||||
TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
|
||||
}
|
||||
|
||||
TEST(GPUPredictor, CategoricalPrediction) {
|
||||
@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
|
||||
}
|
||||
|
||||
TEST(GPUPredictor, CategoricalPredictLeaf) {
|
||||
auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
TestCategoricalPredictLeaf(&ctx, false);
|
||||
}
|
||||
|
||||
@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUPredictorTest, SparseColumnSplit) {
|
||||
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
|
||||
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
|
||||
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
|
||||
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
|
||||
}
|
||||
} // namespace xgboost::predictor
|
||||
|
||||
@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
|
||||
auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
}
|
||||
auto learner = LearnerForTest(&ctx, m_train, kIters);
|
||||
auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
|
||||
@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
|
||||
void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
}
|
||||
size_t constexpr kCols = 10;
|
||||
PredictionCacheEntry out_predictions;
|
||||
@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
|
||||
auto const rank = collective::GetRank();
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
}
|
||||
auto n_threads = collective::GetWorkerLocalThreads(world_size);
|
||||
ctx.UpdateAllowUnknown(
|
||||
@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
|
||||
std::vector<float> const &expected_predt) {
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
}
|
||||
auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
|
||||
std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
|
||||
|
||||
@ -30,7 +30,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
|
||||
|
||||
TEST(Context, DeviceOrdinal) {
|
||||
Context ctx;
|
||||
auto n_vis = common::AllVisibleGPUs();
|
||||
auto n_vis = curt::AllVisibleGPUs();
|
||||
auto ord = n_vis - 1;
|
||||
|
||||
std::string device = "cuda:" + std::to_string(ord);
|
||||
@ -82,7 +82,7 @@ TEST(Context, GPUId) {
|
||||
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
|
||||
TestCUDA(ctx, 0);
|
||||
|
||||
auto n_vis = common::AllVisibleGPUs();
|
||||
auto n_vis = curt::AllVisibleGPUs();
|
||||
auto ord = n_vis - 1;
|
||||
ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
|
||||
TestCUDA(ctx, ord);
|
||||
|
||||
@ -759,7 +759,7 @@ void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args
|
||||
|
||||
auto world_size{3};
|
||||
if (use_gpu) {
|
||||
world_size = common::AllVisibleGPUs();
|
||||
world_size = curt::AllVisibleGPUs();
|
||||
// Simulate MPU on a single GPU. Federated doesn't use nccl, can run multiple
|
||||
// instances on the same GPU.
|
||||
if (world_size == 1 && federated) {
|
||||
|
||||
@ -595,7 +595,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
|
||||
if (common::AllVisibleGPUs() > 1) {
|
||||
if (curt::AllVisibleGPUs() > 1) {
|
||||
// We can't emulate multiple GPUs with NCCL.
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
|
||||
}
|
||||
@ -603,7 +603,7 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
|
||||
if (common::AllVisibleGPUs() > 1) {
|
||||
if (curt::AllVisibleGPUs() > 1) {
|
||||
// We can't emulate multiple GPUs with NCCL.
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
|
||||
}
|
||||
|
||||