Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
2024-09-28 04:26:44 +08:00
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions
--- a/tests/cpp/collective/test_allgather.cu
+++ b/tests/cpp/collective/test_allgather.cu
@@ -94,7 +94,7 @@ class MGPUAllgatherTest : public SocketTest {};
 }  // namespace

 TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
    Worker w{host, port, timeout, n_workers, r};
@@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
 }

 TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
    Worker w{host, port, timeout, n_workers, r};
--- a/tests/cpp/collective/test_allreduce.cu
+++ b/tests/cpp/collective/test_allreduce.cu
@@ -5,7 +5,7 @@
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>  // for host_vector

-#include "../../../src/common/common.h"            // for AllVisibleGPUs
+#include "../../../src/common/cuda_rt_utils.h"     // for AllVisibleGPUs
 #include "../../../src/common/device_helpers.cuh"  // for ToSpan,  device_vector
 #include "../../../src/common/type.h"              // for EraseType
 #include "test_worker.cuh"                         // for NCCLWorkerForTest
@@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
 }  // namespace

 TEST_F(MGPUAllreduceTest, BitOr) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
    Worker w{host, port, timeout, n_workers, r};
@@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
 }

 TEST_F(MGPUAllreduceTest, Sum) {
-  auto n_workers = common::AllVisibleGPUs();
+  auto n_workers = curt::AllVisibleGPUs();
  TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
    Worker w{host, port, timeout, n_workers, r};
--- a/tests/cpp/collective/test_comm_group.cc
+++ b/tests/cpp/collective/test_comm_group.cc
@@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {

 #if defined(XGBOOST_USE_NCCL)
 TEST_F(CommGroupTest, BasicGPU) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
                                 std::int32_t r) {
    auto ctx = MakeCUDACtx(r);
--- a/tests/cpp/collective/test_worker.h
+++ b/tests/cpp/collective/test_worker.h
@@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
  template <typename Fn>
  auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
              [[maybe_unused]] bool emulate_if_single = false) const {
-    auto n_gpus = common::AllVisibleGPUs();
+    auto n_gpus = curt::AllVisibleGPUs();
    if (is_federated) {
 #if defined(XGBOOST_USE_FEDERATED)
      if (n_gpus == 1 && emulate_if_single) {