Use CUDA virtual memory for pinned memory allocation. (#10850)
- Add a grow-only virtual memory allocator. - Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
@@ -3,6 +3,11 @@
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <numeric> // for iota
|
||||
#include <thrust/detail/sequence.inl> // for sequence
|
||||
|
||||
#include "../../../src/common/cuda_rt_utils.h" // for DrVersion
|
||||
#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory
|
||||
#include "../../../src/common/device_vector.cuh"
|
||||
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
|
||||
|
||||
@@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
|
||||
ASSERT_EQ(peak, n_bytes);
|
||||
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
namespace {
// Parametrized fixture exercising the grow-only virtual-memory vector for a
// given memory location (device or host NUMA). It grows the vector in steps
// and checks, after each growth, that previously written data survives and
// that the reserved capacity tracks the allocation granularity.
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
 public:
  void Run() {
    auto type = this->GetParam();
    detail::GrowOnlyVirtualMemVec vec{type};
    auto prop = xgboost::cudr::MakeAllocProp(type);
    auto gran = xgboost::cudr::GetAllocGranularity(&prop);
    ASSERT_GE(gran, 2);

    // First request is deliberately smaller than the allocation granularity.
    auto data = vec.GetSpan<std::int32_t>(32);
    ASSERT_EQ(data.size(), 32);
    static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);

    std::vector<std::int32_t> h_data(data.size());
    // The host mirror must hold the running sequence 0, 1, 2, ...
    auto verify = [&] {
      for (std::size_t i = 0; i < h_data.size(); ++i) {
        ASSERT_EQ(h_data[i], i);
      }
    };
    // Continue the sequence from `n_orig` in the newly grown tail of `s`,
    // then mirror the whole span back into `h_data` for verification.
    auto write_seq = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> s) {
      if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
        thrust::sequence(dh::CachingThrustPolicy(), s.data() + n_orig, s.data() + s.size(),
                         n_orig);
        dh::safe_cuda(cudaMemcpy(h_data.data(), s.data(), s.size_bytes(), cudaMemcpyDefault));
      } else {
        std::iota(s.data() + n_orig, s.data() + s.size(), n_orig);
        std::copy_n(s.data(), s.size(), h_data.data());
      }
    };

    write_seq(0, data);
    verify();

    auto n_orig = data.size();
    // Still smaller than the granularity: the already reserved region is reused.
    data = vec.GetSpan<std::int32_t>(128);
    h_data.resize(data.size());
    write_seq(n_orig, data);
    verify();
    if (128 < gran) {
      ASSERT_EQ(vec.Capacity(), gran);
    }

    // Grow past the first reservation; capacity doubles relative to gran.
    n_orig = data.size();
    data = vec.GetSpan<std::int32_t>(gran / 2);
    h_data.resize(data.size());
    write_seq(n_orig, data);
    verify();
    ASSERT_EQ(vec.Capacity(), gran * 2);

    // Grow once more; capacity doubles again.
    n_orig = data.size();
    data = vec.GetSpan<std::int32_t>(gran);
    h_data.resize(data.size());
    write_seq(n_orig, data);
    verify();
    ASSERT_EQ(vec.Capacity(), gran * 4);
  }
};
}  // anonymous namespace
|
||||
|
||||
// Run the growth/round-trip scenario for each parametrized memory location type.
TEST_P(TestVirtualMem, Alloc) { this->Run(); }
|
||||
|
||||
// Instantiate the suite once per supported memory location, with a
// human-readable suffix for each parameter in the generated test names.
INSTANTIATE_TEST_SUITE_P(
    Basic, TestVirtualMem,
    ::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
    [](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
      if (info.param == CU_MEM_LOCATION_TYPE_DEVICE) {
        return "Device";
      }
      if (info.param == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
        return "HostNuma";
      }
      LOG(FATAL) << "unreachable";
      return nullptr;
    });
|
||||
#endif // defined(__linux__)
|
||||
|
||||
// Pinned memory should be backed by the VM allocator iff the driver supports
// CUDA >= 12.5.
TEST(TestVirtualMem, Version) {
  std::int32_t major, minor;
  xgboost::curt::DrVersion(&major, &minor);
  LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
  PinnedMemory pinned;
  // Lexicographic (major, minor) comparison against 12.5. Note that the
  // previous form `major >= 12 && minor >= 5` is wrong: it rejects newer
  // majors with a small minor (e.g. 13.0, where minor 0 < 5).
  if (major > 12 || (major == 12 && minor >= 5)) {
    ASSERT_TRUE(pinned.IsVm());
  } else {
    ASSERT_FALSE(pinned.IsVm());
  }
}
|
||||
} // namespace dh
|
||||
|
||||
@@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
|
||||
|
||||
namespace {
|
||||
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
|
||||
common::SetDevice(ctx->Ordinal());
|
||||
curt::SetDevice(ctx->Ordinal());
|
||||
auto n = n_samples * n_features;
|
||||
std::vector<float> x;
|
||||
x.resize(n);
|
||||
|
||||
@@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
|
||||
}
|
||||
|
||||
void TestHostDeviceVector(size_t n, DeviceOrd device) {
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
HostDeviceVector<int> v;
|
||||
InitHostDeviceVector(n, device, &v);
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
@@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
|
||||
TEST(HostDeviceVector, Copy) {
|
||||
size_t n = 1001;
|
||||
auto device = DeviceOrd::CUDA(0);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
|
||||
HostDeviceVector<int> v;
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user