Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator. - Define a driver API wrapper. Split up the runtime API wrapper.
2024-09-28 04:26:44 +08:00
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions
--- a/tests/cpp/plugin/federated/test_federated_coll.cu
+++ b/tests/cpp/plugin/federated/test_federated_coll.cu
@@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
 }  // namespace

 TEST_F(FederatedCollTestGPU, Allreduce) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
    TestAllreduce(comm, rank, n_workers);
  });
 }

 TEST(FederatedCollGPUGlobal, Allreduce) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestFederatedGlobal(n_workers, [&] {
    auto r = collective::GetRank();
    auto world = collective::GetWorldSize();
@@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
 }

 TEST_F(FederatedCollTestGPU, Broadcast) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
    TestBroadcast(comm, rank);
  });
 }

 TEST_F(FederatedCollTestGPU, Allgather) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
    TestAllgather(comm, rank, n_workers);
  });
@@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {

 TEST_F(FederatedCollTestGPU, AllgatherV) {
  std::int32_t n_workers = 2;
-  if (common::AllVisibleGPUs() < n_workers) {
+  if (curt::AllVisibleGPUs() < n_workers) {
    GTEST_SKIP_("At least 2 GPUs are required for the test.");
  }
  TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cc
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cc
@@ -10,7 +10,7 @@

 namespace xgboost::collective {
 TEST(CommGroup, Federated) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
    Context ctx;
    ASSERT_EQ(comm_group->Rank(), r);
--- a/tests/cpp/plugin/federated/test_federated_comm_group.cu
+++ b/tests/cpp/plugin/federated/test_federated_comm_group.cu
@@ -11,7 +11,7 @@

 namespace xgboost::collective {
 TEST(CommGroup, FederatedGPU) {
-  std::int32_t n_workers = common::AllVisibleGPUs();
+  std::int32_t n_workers = curt::AllVisibleGPUs();
  TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
    Context ctx = MakeCUDACtx(0);
    auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));