Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
Jiaming Yuan
2024-09-28 04:26:44 +08:00
committed by GitHub
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions

View File

@@ -94,7 +94,7 @@ class MGPUAllgatherTest : public SocketTest {};
} // namespace
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
-auto n_workers = common::AllVisibleGPUs();
+auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};
@@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
}
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
-auto n_workers = common::AllVisibleGPUs();
+auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};

View File

@@ -5,7 +5,7 @@
#include <gtest/gtest.h>
#include <thrust/host_vector.h> // for host_vector
-#include "../../../src/common/common.h" // for AllVisibleGPUs
+#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
#include "../../../src/common/type.h" // for EraseType
#include "test_worker.cuh" // for NCCLWorkerForTest
@@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
} // namespace
TEST_F(MGPUAllreduceTest, BitOr) {
-auto n_workers = common::AllVisibleGPUs();
+auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};
@@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
}
TEST_F(MGPUAllreduceTest, Sum) {
-auto n_workers = common::AllVisibleGPUs();
+auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};

View File

@@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {
#if defined(XGBOOST_USE_NCCL)
TEST_F(CommGroupTest, BasicGPU) {
-std::int32_t n_workers = common::AllVisibleGPUs();
+std::int32_t n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
auto ctx = MakeCUDACtx(r);

View File

@@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
template <typename Fn>
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
[[maybe_unused]] bool emulate_if_single = false) const {
-auto n_gpus = common::AllVisibleGPUs();
+auto n_gpus = curt::AllVisibleGPUs();
if (is_federated) {
#if defined(XGBOOST_USE_FEDERATED)
if (n_gpus == 1 && emulate_if_single) {