Use CUDA virtual memory for pinned memory allocation. (#10850)

- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
Jiaming Yuan
2024-09-28 04:26:44 +08:00
committed by GitHub
parent 13b9874fd6
commit 271f4a80e7
43 changed files with 702 additions and 103 deletions

View File

@@ -94,7 +94,7 @@ class MGPUAllgatherTest : public SocketTest {};
} // namespace
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};
@@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
}
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};

View File

@@ -5,7 +5,7 @@
#include <gtest/gtest.h>
#include <thrust/host_vector.h> // for host_vector
#include "../../../src/common/common.h" // for AllVisibleGPUs
#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
#include "../../../src/common/type.h" // for EraseType
#include "test_worker.cuh" // for NCCLWorkerForTest
@@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
} // namespace
TEST_F(MGPUAllreduceTest, BitOr) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};
@@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
}
TEST_F(MGPUAllreduceTest, Sum) {
auto n_workers = common::AllVisibleGPUs();
auto n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
Worker w{host, port, timeout, n_workers, r};

View File

@@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {
#if defined(XGBOOST_USE_NCCL)
TEST_F(CommGroupTest, BasicGPU) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t r) {
auto ctx = MakeCUDACtx(r);

View File

@@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
template <typename Fn>
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
[[maybe_unused]] bool emulate_if_single = false) const {
auto n_gpus = common::AllVisibleGPUs();
auto n_gpus = curt::AllVisibleGPUs();
if (is_federated) {
#if defined(XGBOOST_USE_FEDERATED)
if (n_gpus == 1 && emulate_if_single) {

View File

@@ -3,6 +3,11 @@
*/
#include <gtest/gtest.h>
#include <numeric> // for iota
#include <thrust/detail/sequence.inl> // for sequence
#include "../../../src/common/cuda_rt_utils.h" // for DrVersion
#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory
#include "../../../src/common/device_vector.cuh"
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
@@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
ASSERT_EQ(peak, n_bytes);
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
}
#if defined(__linux__)
namespace {
// Exercises detail::GrowOnlyVirtualMemVec, the grow-only allocator built on the CUDA
// virtual-memory (cuMem*) driver API.  The test parameter selects where the physical
// pages live: device memory or host NUMA memory.
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
 public:
  void Run() {
    auto type = this->GetParam();
    detail::GrowOnlyVirtualMemVec vec{type};
    // Query the allocation granularity for this location type; all capacity growth
    // below is expressed in multiples of `gran` (bytes).
    auto prop = xgboost::cudr::MakeAllocProp(type);
    auto gran = xgboost::cudr::GetAllocGranularity(&prop);
    ASSERT_GE(gran, 2);
    auto data = vec.GetSpan<std::int32_t>(32);  // should be smaller than granularity
    ASSERT_EQ(data.size(), 32);
    static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);
    // Host-side mirror of the span's contents, refreshed by `fill` and validated by
    // `check` (expects h_data[i] == i, i.e. old elements survive each grow).
    std::vector<std::int32_t> h_data(data.size());
    auto check = [&] {
      for (std::size_t i = 0; i < h_data.size(); ++i) {
        ASSERT_EQ(h_data[i], i);
      }
    };
    // Write the sequence n_orig, n_orig+1, ... into the newly grown tail
    // [n_orig, size), then copy the whole span back into h_data.  Device-resident
    // memory goes through thrust + cudaMemcpy; host NUMA memory is touched directly.
    auto fill = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> data) {
      if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
        thrust::sequence(dh::CachingThrustPolicy(), data.data() + n_orig, data.data() + data.size(),
                         n_orig);
        dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault));
      } else {
        std::iota(data.data() + n_orig, data.data() + data.size(), n_orig);
        std::copy_n(data.data(), data.size(), h_data.data());
      }
    };
    fill(0, data);
    check();
    auto n_orig = data.size();
    // Should be smaller than granularity, use already reserved.
    data = vec.GetSpan<std::int32_t>(128);
    h_data.resize(data.size());
    fill(n_orig, data);
    check();
    if (128 < gran) {
      ASSERT_EQ(vec.Capacity(), gran);
    }
    // gran / 2 int32 elements == 2 * gran bytes, which exceeds the first physical
    // allocation; capacity appears to double — TODO confirm growth policy in
    // GrowOnlyVirtualMemVec.
    n_orig = data.size();
    data = vec.GetSpan<std::int32_t>(gran / 2);
    h_data.resize(data.size());
    fill(n_orig, data);
    check();
    ASSERT_EQ(vec.Capacity(), gran * 2);
    // gran int32 elements == 4 * gran bytes; capacity doubles again.  Earlier
    // elements must still be intact (checked against h_data).
    n_orig = data.size();
    data = vec.GetSpan<std::int32_t>(gran);
    h_data.resize(data.size());
    fill(n_orig, data);
    check();
    ASSERT_EQ(vec.Capacity(), gran * 4);
  }
};
} // anonymous namespace
// Run the grow-only virtual-memory allocator test for each parameterized location.
TEST_P(TestVirtualMem, Alloc) { this->Run(); }

// Instantiate the suite for device-resident and host-NUMA-resident physical pages.
// The lambda maps each CUmemLocationType value to a readable test-name suffix
// (e.g. Basic/TestVirtualMem.Alloc/Device).
INSTANTIATE_TEST_SUITE_P(
    Basic, TestVirtualMem,
    ::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
    [](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
      auto type = info.param;
      switch (type) {
        case CU_MEM_LOCATION_TYPE_DEVICE:
          return "Device";
        case CU_MEM_LOCATION_TYPE_HOST_NUMA:
          return "HostNuma";
        default:
          LOG(FATAL) << "unreachable";
      }
      // Unreachable; satisfies the compiler's return-path check after LOG(FATAL).
      return nullptr;
    });
#endif // defined(__linux__)
// PinnedMemory should select the virtual-memory (cuMem*) implementation exactly when
// the installed driver supports CUDA 12.5 or newer; otherwise it must fall back to
// the non-VM path.
TEST(TestVirtualMem, Version) {
  std::int32_t major, minor;
  xgboost::curt::DrVersion(&major, &minor);
  LOG(INFO) << "Latest supported CUDA version by the driver: " << major << "." << minor;
  PinnedMemory pinned;
  // Compare (major, minor) lexicographically.  The previous check
  // `major >= 12 && minor >= 5` wrongly classified drivers such as 13.0
  // (major > 12 but minor < 5) as lacking 12.5 support.
  if (major > 12 || (major == 12 && minor >= 5)) {
    ASSERT_TRUE(pinned.IsVm());
  } else {
    ASSERT_FALSE(pinned.IsVm());
  }
}
} // namespace dh

View File

@@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
common::SetDevice(ctx->Ordinal());
curt::SetDevice(ctx->Ordinal());
auto n = n_samples * n_features;
std::vector<float> x;
x.resize(n);

View File

@@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
}
void TestHostDeviceVector(size_t n, DeviceOrd device) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, device, &v);
CheckDevice(&v, n, 0, GPUAccess::kRead);
@@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
TEST(HostDeviceVector, Copy) {
size_t n = 1001;
auto device = DeviceOrd::CUDA(0);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
HostDeviceVector<int> v;
{

View File

@@ -72,7 +72,7 @@ TEST_P(TestEllpackPageRawFormat, DiskIO) {
}
TEST_P(TestEllpackPageRawFormat, DiskIOHmm) {
if (common::SupportsPageableMem()) {
if (curt::SupportsPageableMem()) {
EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
this->Run(&policy, this->GetParam());
} else {

View File

@@ -655,7 +655,7 @@ class RMMAllocator {
std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
int n_gpu;
RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
RMMAllocator() : n_gpu(curt::AllVisibleGPUs()) {
int current_device;
CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
for (int i = 0; i < n_gpu; ++i) {
@@ -697,5 +697,5 @@ void DeleteRMMResource(RMMAllocator*) {}
RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
std::int32_t DistGpuIdx() { return curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
} // namespace xgboost

View File

@@ -34,7 +34,7 @@
#endif
#if defined(__CUDACC__)
#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
#define GPUIDX (curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
#else
#define GPUIDX (-1)
#endif

View File

@@ -47,7 +47,7 @@ class TestDistributedMetric : public ::testing::TestWithParam<Param> {
std::int32_t n_workers{0};
if (device.IsCUDA()) {
n_workers = common::AllVisibleGPUs();
n_workers = curt::AllVisibleGPUs();
} else {
n_workers = std::min(static_cast<std::int32_t>(std::thread::hardware_concurrency()), 3);
}

View File

@@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
} // namespace
TEST_F(FederatedCollTestGPU, Allreduce) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
TestAllreduce(comm, rank, n_workers);
});
}
TEST(FederatedCollGPUGlobal, Allreduce) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederatedGlobal(n_workers, [&] {
auto r = collective::GetRank();
auto world = collective::GetWorldSize();
@@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
}
TEST_F(FederatedCollTestGPU, Broadcast) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
TestBroadcast(comm, rank);
});
}
TEST_F(FederatedCollTestGPU, Allgather) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
TestAllgather(comm, rank, n_workers);
});
@@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {
TEST_F(FederatedCollTestGPU, AllgatherV) {
std::int32_t n_workers = 2;
if (common::AllVisibleGPUs() < n_workers) {
if (curt::AllVisibleGPUs() < n_workers) {
GTEST_SKIP_("At least 2 GPUs are required for the test.");
}
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {

View File

@@ -10,7 +10,7 @@
namespace xgboost::collective {
TEST(CommGroup, Federated) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
Context ctx;
ASSERT_EQ(comm_group->Rank(), r);

View File

@@ -11,7 +11,7 @@
namespace xgboost::collective {
TEST(CommGroup, FederatedGPU) {
std::int32_t n_workers = common::AllVisibleGPUs();
std::int32_t n_workers = curt::AllVisibleGPUs();
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
Context ctx = MakeCUDACtx(0);
auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));

View File

@@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
}
TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
}
TEST(GPUPredictor, CategoricalPrediction) {
@@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
}
TEST(GPUPredictor, CategoricalPredictLeaf) {
auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
TestCategoricalPredictLeaf(&ctx, false);
}
@@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
}
TEST_F(MGPUPredictorTest, SparseColumnSplit) {
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
}
} // namespace xgboost::predictor

View File

@@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
}
auto learner = LearnerForTest(&ctx, m_train, kIters);
auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
@@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
}
size_t constexpr kCols = 10;
PredictionCacheEntry out_predictions;
@@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
auto const rank = collective::GetRank();
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
}
auto n_threads = collective::GetWorkerLocalThreads(world_size);
ctx.UpdateAllowUnknown(
@@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
std::vector<float> const &expected_predt) {
Context ctx;
if (use_gpu) {
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
}
auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};

View File

@@ -30,7 +30,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
TEST(Context, DeviceOrdinal) {
Context ctx;
auto n_vis = common::AllVisibleGPUs();
auto n_vis = curt::AllVisibleGPUs();
auto ord = n_vis - 1;
std::string device = "cuda:" + std::to_string(ord);
@@ -82,7 +82,7 @@ TEST(Context, GPUId) {
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
TestCUDA(ctx, 0);
auto n_vis = common::AllVisibleGPUs();
auto n_vis = curt::AllVisibleGPUs();
auto ord = n_vis - 1;
ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
TestCUDA(ctx, ord);

View File

@@ -759,7 +759,7 @@ void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args
auto world_size{3};
if (use_gpu) {
world_size = common::AllVisibleGPUs();
world_size = curt::AllVisibleGPUs();
// Simulate MPU on a single GPU. Federated doesn't use nccl, can run multiple
// instances on the same GPU.
if (world_size == 1 && federated) {

View File

@@ -595,7 +595,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
} // anonymous namespace
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
if (common::AllVisibleGPUs() > 1) {
if (curt::AllVisibleGPUs() > 1) {
// We can't emulate multiple GPUs with NCCL.
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
}
@@ -603,7 +603,7 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
}
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
if (common::AllVisibleGPUs() > 1) {
if (curt::AllVisibleGPUs() > 1) {
// We can't emulate multiple GPUs with NCCL.
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
}