Use CUDA virtual memory for pinned memory allocation. (#10850)
- Add a grow-only virtual memory allocator.
- Define a driver API wrapper. Split up the runtime API wrapper.
This commit is contained in:
parent 13b9874fd6
commit 271f4a80e7
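
Before the file-by-file diff, a minimal usage sketch of the allocator this commit introduces. It assumes a CUDA-enabled build of this branch; the element type, sizes, and the function name below are illustrative only and are not part of the commit:

  // Hypothetical caller of the new grow-only pinned allocator (dh::PinnedMemory).
  #include "device_helpers.cuh"  // dh::PinnedMemory, from src/common in this branch

  void PinnedScratchSketch() {
    dh::PinnedMemory pinned;                      // picks the virtual-memory backend when available
    auto small = pinned.GetSpan<float>(256);      // first request maps one physical block
    auto large = pinned.GetSpan<float>(1 << 20);  // a later request only grows the mapping
    // With the virtual-memory backend the data written through `small` is preserved across the
    // growth (remapped rather than discarded); the cudaMallocHost fallback re-allocates instead.
    (void)small;
    (void)large;
  }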
src/common/cuda_dr_utils.cc (new file, 108 lines added)
@@ -0,0 +1,108 @@
/**
 * Copyright 2024, XGBoost contributors
 */
#if defined(XGBOOST_USE_CUDA)
#include "cuda_dr_utils.h"

#include <algorithm>  // for max
#include <cstdint>    // for int32_t
#include <cstring>    // for memset
#include <memory>     // for make_unique
#include <mutex>      // for call_once
#include <sstream>    // for stringstream
#include <string>     // for string

#include "common.h"               // for safe_cuda
#include "cuda_rt_utils.h"        // for CurrentDevice
#include "xgboost/string_view.h"  // for StringView

namespace xgboost::cudr {
CuDriverApi::CuDriverApi() {
  // similar to dlopen, but without the need to release a handle.
  auto safe_load = [](xgboost::StringView name, auto **fnptr) {
    cudaDriverEntryPointQueryResult status;
    dh::safe_cuda(cudaGetDriverEntryPoint(name.c_str(), reinterpret_cast<void **>(fnptr),
                                          cudaEnablePerThreadDefaultStream, &status));
    CHECK(status == cudaDriverEntryPointSuccess) << name;
    CHECK(*fnptr);
  };

  safe_load("cuMemGetAllocationGranularity", &this->cuMemGetAllocationGranularity);
  safe_load("cuMemCreate", &this->cuMemCreate);
  safe_load("cuMemMap", &this->cuMemMap);
  safe_load("cuMemAddressReserve", &this->cuMemAddressReserve);
  safe_load("cuMemSetAccess", &this->cuMemSetAccess);
  safe_load("cuMemUnmap", &this->cuMemUnmap);
  safe_load("cuMemRelease", &this->cuMemRelease);
  safe_load("cuMemAddressFree", &this->cuMemAddressFree);
  safe_load("cuGetErrorString", &this->cuGetErrorString);
  safe_load("cuGetErrorName", &this->cuGetErrorName);
  safe_load("cuDeviceGetAttribute", &this->cuDeviceGetAttribute);
  safe_load("cuDeviceGet", &this->cuDeviceGet);

  CHECK(this->cuMemGetAllocationGranularity);
}

void CuDriverApi::ThrowIfError(CUresult status, StringView fn, std::int32_t line,
                               char const *file) const {
  if (status == CUDA_SUCCESS) {
    return;
  }
  std::string cuerr{"CUDA driver error:"};

  char const *name{nullptr};
  auto err0 = this->cuGetErrorName(status, &name);
  if (err0 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error name:" << err0;
  }
  char const *msg{nullptr};
  auto err1 = this->cuGetErrorString(status, &msg);
  if (err1 != CUDA_SUCCESS) {
    LOG(WARNING) << cuerr << status << ". Then we failed to get error string:" << err1;
  }

  std::stringstream ss;
  ss << fn << "[" << file << ":" << line << "]:";
  if (name != nullptr && err0 == CUDA_SUCCESS) {
    ss << cuerr << " " << name << ".";
  }
  if (msg != nullptr && err1 == CUDA_SUCCESS) {
    ss << " " << msg << "\n";
  }
  LOG(FATAL) << ss.str();
}

[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi() {
  static std::once_flag flag;
  static std::unique_ptr<CuDriverApi> cu;
  std::call_once(flag, [&] { cu = std::make_unique<CuDriverApi>(); });
  return *cu;
}

void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
  auto ordinal = curt::CurrentDevice();
  loc->type = type;

  if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
    loc->id = ordinal;
  } else {
    std::int32_t numa_id = -1;
    CUdevice device;
    safe_cu(GetGlobalCuDriverApi().cuDeviceGet(&device, ordinal));
    safe_cu(GetGlobalCuDriverApi().cuDeviceGetAttribute(&numa_id, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID,
                                                        device));
    numa_id = std::max(numa_id, 0);

    loc->id = numa_id;
  }
}

[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
  CUmemAllocationProp prop;
  std::memset(&prop, '\0', sizeof(prop));
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  MakeCuMemLocation(type, &prop.location);
  return prop;
}
}  // namespace xgboost::cudr
#endif
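
For reference, a self-contained sketch of the entry-point loading used in the constructor above: it resolves one driver symbol through the runtime API so that libcuda.so never becomes a link-time dependency. This is illustrative only, assumes CUDA 12 or newer for the four-argument cudaGetDriverEntryPoint overload, and is not part of the commit:

  #include <cuda.h>
  #include <cuda_runtime_api.h>
  #include <cstdio>

  // Resolve cuGetErrorString at runtime, the same way CuDriverApi does for every symbol it needs.
  using GetErrorStringFn = CUresult(CUresult, const char **);

  int main() {
    GetErrorStringFn *get_error_string = nullptr;
    cudaDriverEntryPointQueryResult status;
    auto rc = cudaGetDriverEntryPoint("cuGetErrorString",
                                      reinterpret_cast<void **>(&get_error_string),
                                      cudaEnablePerThreadDefaultStream, &status);
    if (rc != cudaSuccess || status != cudaDriverEntryPointSuccess || get_error_string == nullptr) {
      std::printf("driver entry point not available\n");
      return 1;
    }
    const char *msg = nullptr;
    get_error_string(CUDA_ERROR_INVALID_VALUE, &msg);  // exercise the loaded symbol
    std::printf("CUDA_ERROR_INVALID_VALUE -> %s\n", msg ? msg : "(null)");
    return 0;
  }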
src/common/cuda_dr_utils.h (new file, 105 lines added)
@@ -0,0 +1,105 @@
/**
 * Copyright 2024, XGBoost contributors
 *
 * @brief Utility for CUDA driver API.
 *
 * XGBoost doesn't link libcuda.so at build time. The utilities here load the shared
 * object at runtime.
 */
#pragma once

#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cstdint>  // for int32_t

#include "xgboost/string_view.h"  // for StringView

namespace xgboost::cudr {
/**
 * @brief A struct for retrieving CUDA driver API from the runtime API.
 */
struct CuDriverApi {
  using Flags = unsigned long long;  // NOLINT

  // Memory manipulation functions.
  using MemGetAllocationGranularityFn = CUresult(size_t *granularity,
                                                 const CUmemAllocationProp *prop,
                                                 CUmemAllocationGranularity_flags option);
  using MemCreateFn = CUresult(CUmemGenericAllocationHandle *handle, size_t size,
                               const CUmemAllocationProp *prop, Flags flags);
  using MemMapFn = CUresult(CUdeviceptr ptr, size_t size, size_t offset,
                            CUmemGenericAllocationHandle handle, Flags flags);
  using MemAddressReserveFn = CUresult(CUdeviceptr *ptr, size_t size, size_t alignment,
                                       CUdeviceptr addr, Flags flags);
  using MemSetAccessFn = CUresult(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc,
                                  size_t count);
  using MemUnmapFn = CUresult(CUdeviceptr ptr, size_t size);
  using MemReleaseFn = CUresult(CUmemGenericAllocationHandle handle);
  using MemAddressFreeFn = CUresult(CUdeviceptr ptr, size_t size);
  // Error handling
  using GetErrorString = CUresult(CUresult error, const char **pStr);
  using GetErrorName = CUresult(CUresult error, const char **pStr);
  // Device attributes
  using DeviceGetAttribute = CUresult(int *pi, CUdevice_attribute attrib, CUdevice dev);
  using DeviceGet = CUresult(CUdevice *device, int ordinal);

  MemGetAllocationGranularityFn *cuMemGetAllocationGranularity{nullptr};  // NOLINT
  MemCreateFn *cuMemCreate{nullptr};                                      // NOLINT
  /**
   * @param[in] offset - Must be zero.
   */
  MemMapFn *cuMemMap{nullptr};  // NOLINT
  /**
   * @param[out] ptr       - Resulting pointer to start of virtual address range allocated
   * @param[in]  size      - Size of the reserved virtual address range requested
   * @param[in]  alignment - Alignment of the reserved virtual address range requested
   * @param[in]  addr      - Fixed starting address range requested
   * @param[in]  flags     - Currently unused, must be zero
   */
  MemAddressReserveFn *cuMemAddressReserve{nullptr};  // NOLINT
  MemSetAccessFn *cuMemSetAccess{nullptr};            // NOLINT
  MemUnmapFn *cuMemUnmap{nullptr};                    // NOLINT
  MemReleaseFn *cuMemRelease{nullptr};                // NOLINT
  MemAddressFreeFn *cuMemAddressFree{nullptr};        // NOLINT
  GetErrorString *cuGetErrorString{nullptr};          // NOLINT
  GetErrorName *cuGetErrorName{nullptr};              // NOLINT
  DeviceGetAttribute *cuDeviceGetAttribute{nullptr};  // NOLINT
  DeviceGet *cuDeviceGet{nullptr};                    // NOLINT

  CuDriverApi();

  void ThrowIfError(CUresult status, StringView fn, std::int32_t line, char const *file) const;
};

[[nodiscard]] CuDriverApi &GetGlobalCuDriverApi();

/**
 * @brief Macro for guarding CUDA driver API calls.
 */
#define safe_cu(call)                                                                        \
  do {                                                                                       \
    auto __status = (call);                                                                  \
    if (__status != CUDA_SUCCESS) {                                                          \
      ::xgboost::cudr::GetGlobalCuDriverApi().ThrowIfError(__status, #call, __LINE__,        \
                                                           __FILE__);                        \
    }                                                                                        \
  } while (0)

// Get the allocation granularity.
inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
  std::size_t granularity;
  safe_cu(GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
      &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return granularity;
}

/**
 * @brief Obtain appropriate device ordinal for `CUmemLocation`.
 */
void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);

/**
 * @brief Construct a `CUmemAllocationProp`.
 */
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
}  // namespace xgboost::cudr
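
A hedged example of the wrapper in use: query the recommended granularity for pinned device allocations through MakeAllocProp, GetAllocGranularity, and the safe_cu guard defined above. The function name is made up for illustration; host-NUMA locations work the same way on drivers new enough to report CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID:

  #include <cstddef>   // for size_t
  #include <iostream>  // for cout

  #include "cuda_dr_utils.h"  // MakeAllocProp, GetAllocGranularity, GetGlobalCuDriverApi, safe_cu

  // Illustrative helper, not part of the commit.
  inline void ReportGranularity() {
    auto prop = xgboost::cudr::MakeAllocProp(CU_MEM_LOCATION_TYPE_DEVICE);
    std::size_t recommended = xgboost::cudr::GetAllocGranularity(&prop);
    // The same query written against the raw wrapper, guarded by safe_cu:
    std::size_t minimum = 0;
    safe_cu(xgboost::cudr::GetGlobalCuDriverApi().cuMemGetAllocationGranularity(
        &minimum, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    std::cout << "granularity: recommended=" << recommended << " minimum=" << minimum << "\n";
  }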
@@ -8,10 +8,11 @@
 #endif  // defined(XGBOOST_USE_CUDA)

 #include <cstdint>  // for int32_t
+#include <mutex>    // for once_flag, call_once

 #include "common.h"  // for safe_cuda

-namespace xgboost::common {
+namespace xgboost::curt {
 #if defined(XGBOOST_USE_CUDA)
 std::int32_t AllVisibleGPUs() {
   int n_visgpus = 0;
@@ -19,7 +20,7 @@ std::int32_t AllVisibleGPUs() {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch (const dmlc::Error &) {
+  } catch (const dmlc::Error&) {
     cudaGetLastError();  // reset error.
     return 0;
   }
@@ -63,11 +64,36 @@ void SetDevice(std::int32_t device) {
     dh::safe_cuda(cudaSetDevice(device));
   }
 }
+
+namespace {
+template <typename Fn>
+void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
+  static std::int32_t version = 0;
+  static std::once_flag flag;
+  std::call_once(flag, [&] { fn(&version); });
+  if (major) {
+    *major = version / 1000;
+  }
+  if (minor) {
+    *minor = version % 100 / 10;
+  }
+}
+}  // namespace
+
+void RtVersion(std::int32_t* major, std::int32_t* minor) {
+  GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaRuntimeGetVersion(ver)); }, major,
+                 minor);
+}
+
+void DrVersion(std::int32_t* major, std::int32_t* minor) {
+  GetVersionImpl([](std::int32_t* ver) { dh::safe_cuda(cudaDriverGetVersion(ver)); }, major, minor);
+}
+
 #else
 std::int32_t AllVisibleGPUs() { return 0; }

 std::int32_t CurrentDevice() {
-  AssertGPUSupport();
+  common::AssertGPUSupport();
   return -1;
 }

@@ -79,8 +105,8 @@ void CheckComputeCapability() {}

 void SetDevice(std::int32_t device) {
   if (device >= 0) {
-    AssertGPUSupport();
+    common::AssertGPUSupport();
   }
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace xgboost::common
+}  // namespace xgboost::curt
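
The version helpers above decode the integer reported by the CUDA runtime (for example 12050 for CUDA 12.5) into a major/minor pair via version / 1000 and version % 100 / 10. A minimal sketch of calling them, assuming the declarations from cuda_rt_utils.h are visible; the function name is illustrative only:

  #include <cstdint>
  #include <iostream>

  #include "cuda_rt_utils.h"  // for RtVersion, DrVersion

  void ReportCudaVersions() {
    std::int32_t rt_major = 0, rt_minor = 0, dr_major = 0, dr_minor = 0;
    xgboost::curt::RtVersion(&rt_major, &rt_minor);  // e.g. 12050 -> 12, 5
    xgboost::curt::DrVersion(&dr_major, &dr_minor);
    std::cout << "runtime: " << rt_major << "." << rt_minor
              << ", driver supports up to: " << dr_major << "." << dr_minor << "\n";
  }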
@@ -8,7 +8,7 @@
 #include <nvtx3/nvtx3.hpp>
 #endif  // defined(XGBOOST_USE_NVTX)

-namespace xgboost::common {
+namespace xgboost::curt {
 std::int32_t AllVisibleGPUs();

 std::int32_t CurrentDevice();
@@ -24,6 +24,12 @@ void CheckComputeCapability();

 void SetDevice(std::int32_t device);

+// Returns the CUDA Runtime version.
+void RtVersion(std::int32_t* major, std::int32_t* minor);
+
+// Returns the latest version of CUDA supported by the driver.
+void DrVersion(std::int32_t* major, std::int32_t* minor);
+
 struct NvtxDomain {
   static constexpr char const *name{"libxgboost"};  // NOLINT
 };
@@ -49,10 +55,10 @@ class NvtxRgb {
   explicit NvtxRgb(Args &&...) {}
 };
 #endif  // defined(XGBOOST_USE_NVTX)
-}  // namespace xgboost::common
+}  // namespace xgboost::curt

 #if defined(XGBOOST_USE_NVTX)
-#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
+#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::curt::NvtxDomain)
 #else
 #define xgboost_NVTX_FN_RANGE()
 #endif  // defined(XGBOOST_USE_NVTX)
src/common/device_helpers.cu (new file, 23 lines added)
@@ -0,0 +1,23 @@
/**
 * Copyright 2024, XGBoost contributors
 */
#include "cuda_rt_utils.h"    // for RtVersion
#include "device_helpers.cuh"
#include "xgboost/windefs.h"  // for xgboost_IS_WIN

namespace dh {
PinnedMemory::PinnedMemory() {
#if defined(xgboost_IS_WIN)
  this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
#else
  std::int32_t major{0}, minor{0};
  xgboost::curt::DrVersion(&major, &minor);
  // Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable.
  if (major >= 12 && minor >= 5) {
    this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
  } else {
    this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
  }
#endif
}
}  // namespace dh
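
The constructor above picks the backend once per object: the virtual-memory allocator on Linux with a sufficiently new driver, and the cudaMallocHost-based fallback otherwise (always on Windows). A hedged sanity check mirroring the unit test added later in this diff; the function name is illustrative and assumes a Linux build:

  #include <cstdint>

  #include "cuda_rt_utils.h"     // for DrVersion
  #include "device_helpers.cuh"  // for dh::PinnedMemory

  // Illustrative check, not part of the commit: confirm which backend was picked on Linux.
  inline bool BackendMatchesDriverGate() {
    std::int32_t major = 0, minor = 0;
    xgboost::curt::DrVersion(&major, &minor);
    bool expect_vm = (major >= 12 && minor >= 5);  // same gate as the constructor
    dh::PinnedMemory pinned;
    return pinned.IsVm() == expect_vm;
  }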
@@ -16,7 +16,8 @@
 #include <cstddef>  // for size_t
 #include <cub/cub.cuh>
 #include <cub/util_type.cuh>  // for UnitWord, DoubleBuffer
-#include <vector>
+#include <variant>  // for variant, visit
+#include <vector>   // for vector

 #include "common.h"
 #include "device_vector.cuh"
@@ -372,36 +373,25 @@ void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<const T>
 }

 // Keep track of pinned memory allocation
-struct PinnedMemory {
-  void *temp_storage{nullptr};
-  size_t temp_storage_bytes{0};
+class PinnedMemory {
+  std::variant<detail::GrowOnlyPinnedMemoryImpl, detail::GrowOnlyVirtualMemVec> impl_;

-  ~PinnedMemory() { Free(); }
+ public:
+  PinnedMemory();

   template <typename T>
   xgboost::common::Span<T> GetSpan(size_t size) {
-    size_t num_bytes = size * sizeof(T);
-    if (num_bytes > temp_storage_bytes) {
-      Free();
-      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
-      temp_storage_bytes = num_bytes;
-    }
-    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
+    return std::visit([&](auto &&alloc) { return alloc.template GetSpan<T>(size); }, this->impl_);
   }

   template <typename T>
-  xgboost::common::Span<T> GetSpan(size_t size, T init) {
+  xgboost::common::Span<T> GetSpan(size_t size, T const &init) {
     auto result = this->GetSpan<T>(size);
-    for (auto &e : result) {
-      e = init;
-    }
+    std::fill_n(result.data(), result.size(), init);
     return result;
   }

-  void Free() {
-    if (temp_storage != nullptr) {
-      safe_cuda(cudaFreeHost(temp_storage));
-    }
+  // Used for testing.
+  [[nodiscard]] bool IsVm() {
+    return std::get_if<detail::GrowOnlyVirtualMemVec>(&this->impl_) != nullptr;
   }
 };
@@ -1,10 +1,14 @@
 /**
  * Copyright 2017-2024, XGBoost contributors
  */
+#include <numeric>  // for accumulate
+
 #include "../collective/communicator-inl.h"  // for GetRank
 #include "common.h"                          // for HumanMemUnit
+#include "cuda_dr_utils.h"
 #include "device_helpers.cuh"  // for CurrentDevice
 #include "device_vector.cuh"
+#include "transform_iterator.h"  // for MakeIndexTransformIter

 namespace dh {
 namespace detail {
@@ -18,6 +22,79 @@ void ThrowOOMError(std::string const &err, std::size_t bytes) {
      << "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
   LOG(FATAL) << ss.str();
 }
+
+[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::PhyCapacity() const {
+  auto it = xgboost::common::MakeIndexTransformIter(
+      [&](std::size_t i) { return this->handles_[i]->size; });
+  return std::accumulate(it, it + this->handles_.size(), static_cast<std::size_t>(0));
+}
+
+void GrowOnlyVirtualMemVec::Reserve(std::size_t new_size) {
+  auto va_capacity = this->Capacity();
+  if (new_size < va_capacity) {
+    return;
+  }
+
+  // Try to reserve new virtual address.
+  auto const aligned_size = RoundUp(new_size, this->granularity_);
+  auto const new_reserve_size = aligned_size - va_capacity;
+  CUresult status = CUDA_SUCCESS;
+  auto hint = this->DevPtr() + va_capacity;
+
+  bool failed{false};
+  auto range = std::make_unique<VaRange>(new_reserve_size, hint, &status, &failed);
+  if (failed) {
+    // Failed to reserve the requested address.
+    // Slow path, try to reserve a new address with full size.
+    range = std::make_unique<VaRange>(aligned_size, 0ULL, &status, &failed);
+    safe_cu(status);
+    CHECK(!failed);
+
+    // New allocation is successful. Map the physical address to the virtual address.
+    // First unmap the existing ptr.
+    if (this->DevPtr() != 0) {
+      // Unmap the existing ptr.
+      safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
+
+      // Then remap all the existing physical addresses to the new ptr.
+      CUdeviceptr ptr = range->DevPtr();
+      for (auto const &hdl : this->handles_) {
+        this->MapBlock(ptr, hdl);
+        ptr += hdl->size;
+      }
+
+      // Release the existing ptr.
+      va_ranges_.clear();
+    }
+  }
+
+  va_ranges_.emplace_back(std::move(range));
+}
+
+GrowOnlyVirtualMemVec::GrowOnlyVirtualMemVec(CUmemLocationType type)
+    : prop_{xgboost::cudr::MakeAllocProp(type)},
+      granularity_{xgboost::cudr::GetAllocGranularity(&this->prop_)} {
+  CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA);
+  // Assign the access descriptor
+  CUmemAccessDesc dacc;
+  dacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &dacc.location);
+  this->access_desc_.push_back(dacc);
+
+  if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
+    CUmemAccessDesc hacc;
+    hacc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    xgboost::cudr::MakeCuMemLocation(type, &hacc.location);
+    this->access_desc_.push_back(hacc);
+  }
+}
+
+[[nodiscard]] std::size_t GrowOnlyVirtualMemVec::Capacity() const {
+  auto it = xgboost::common::MakeIndexTransformIter(
+      [&](std::size_t i) { return this->va_ranges_[i]->Size(); });
+  return std::accumulate(it, it + this->va_ranges_.size(), static_cast<std::size_t>(0));
+}
 }  // namespace detail

 #if defined(XGBOOST_USE_RMM)
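
Growth above is always in whole granules: GrowTo (defined in the header below) pads the requested delta up to the allocation granularity before reserving and mapping a new physical block. A worked example of that rounding, with a hypothetical 2 MiB granularity (the real value comes from cuMemGetAllocationGranularity); this is a standalone sketch, not part of the commit:

  #include <cassert>
  #include <cstddef>

  // Same rounding rule as GrowOnlyVirtualMemVec::RoundUp.
  constexpr std::size_t RoundUpTo(std::size_t n, std::size_t chunk) {
    return ((n + chunk - 1) / chunk) * chunk;
  }

  int main() {
    constexpr std::size_t kGran = 2u << 20;               // assumed 2 MiB granularity
    static_assert(RoundUpTo(1, kGran) == kGran);          // 1 byte still maps one full granule
    static_assert(RoundUpTo(kGran, kGran) == kGran);      // exact multiples are unchanged
    static_assert(RoundUpTo(kGran + 1, kGran) == 2 * kGran);
    // Growing from 2 MiB of mapped blocks to a 3 MiB request: delta = 1 MiB, padded to 2 MiB,
    // so exactly one new physical block is created and mapped at the end of the range.
    assert(RoundUpTo((3u << 20) - (2u << 20), kGran) == kGran);
    return 0;
  }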
@@ -25,6 +25,8 @@

 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1

+#include <cuda.h>
+
 #include <cstddef>  // for size_t
 #include <cub/util_allocator.cuh>  // for CachingDeviceAllocator
 #include <cub/util_device.cuh>     // for CurrentDevice
@@ -33,7 +35,9 @@
 #include <mutex>  // for defer_lock

 #include "common.h"  // for safe_cuda, HumanMemUnit
+#include "cuda_dr_utils.h"  // for CuDriverApi
 #include "xgboost/logging.h"
+#include "xgboost/span.h"  // for Span

 namespace dh {
 namespace detail {
@@ -127,6 +131,153 @@ class MemoryLogger {
 };

 void ThrowOOMError(std::string const &err, std::size_t bytes);
+
+struct GrowOnlyPinnedMemoryImpl {
+  void *temp_storage{nullptr};
+  size_t temp_storage_bytes{0};
+
+  ~GrowOnlyPinnedMemoryImpl() { Free(); }
+
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(size_t size) {
+    size_t num_bytes = size * sizeof(T);
+    if (num_bytes > temp_storage_bytes) {
+      Free();
+      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
+      temp_storage_bytes = num_bytes;
+    }
+    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
+  }
+
+  void Free() {
+    if (temp_storage != nullptr) {
+      safe_cuda(cudaFreeHost(temp_storage));
+    }
+  }
+};
+
+/**
+ * @brief Use low-level virtual memory functions from CUDA driver API for grow-only memory
+ *        allocation.
+ *
+ * @url https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/
+ *
+ * Aside from the potential performance benefits, this is primarily implemented to prevent
+ * deadlock in NCCL and XGBoost. The host NUMA version requires CTK 12.5+ to be stable.
+ */
+class GrowOnlyVirtualMemVec {
+  static auto RoundUp(std::size_t new_sz, std::size_t chunk_sz) {
+    return ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;
+  }
+
+  struct PhyAddrHandle {
+    CUmemGenericAllocationHandle handle;
+    std::size_t size;
+  };
+
+  class VaRange {
+    CUdeviceptr ptr_{0};
+    std::size_t size_{0};
+
+   public:
+    VaRange(std::size_t size, CUdeviceptr hint, CUresult *p_status, bool *failed) : size_{size} {
+      CUresult &status = *p_status;
+      status = xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressReserve(&ptr_, size, 0, hint, 0);
+      *failed = status != CUDA_SUCCESS || (hint != 0 && ptr_ != hint);
+    }
+    ~VaRange() {
+      if (ptr_ != 0) {
+        xgboost::cudr::GetGlobalCuDriverApi().cuMemAddressFree(ptr_, this->size_);
+      }
+    }
+
+    VaRange(VaRange const &that) = delete;
+    VaRange &operator=(VaRange const &that) = delete;
+
+    VaRange(VaRange &&that) { std::swap(*this, that); }
+    VaRange &operator=(VaRange &&that) {
+      std::swap(*this, that);
+      return *this;
+    }
+    [[nodiscard]] auto DevPtr() const { return this->ptr_; }
+    [[nodiscard]] std::size_t Size() const { return this->size_; }
+  };
+
+  using PhyHandle = std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>;
+  std::vector<PhyHandle> handles_;
+  std::vector<std::unique_ptr<VaRange>> va_ranges_;
+
+  xgboost::cudr::CuDriverApi &cu_{xgboost::cudr::GetGlobalCuDriverApi()};
+  std::vector<CUmemAccessDesc> access_desc_;
+  CUmemAllocationProp const prop_;
+
+  // Always use bytes.
+  std::size_t const granularity_;
+
+  [[nodiscard]] std::size_t PhyCapacity() const;
+  [[nodiscard]] CUdeviceptr DevPtr() const {
+    if (this->va_ranges_.empty()) {
+      return 0;
+    }
+    return this->va_ranges_.front()->DevPtr();
+  }
+  void MapBlock(CUdeviceptr ptr, PhyHandle const &hdl) const {
+    safe_cu(cu_.cuMemMap(ptr, hdl->size, 0, hdl->handle, 0));
+    safe_cu(cu_.cuMemSetAccess(ptr, hdl->size, access_desc_.data(), access_desc_.size()));
+  }
+  auto CreatePhysicalMem(std::size_t size) const {
+    CUmemGenericAllocationHandle alloc_handle;
+    auto padded_size = RoundUp(size, this->granularity_);
+    CUresult status = this->cu_.cuMemCreate(&alloc_handle, padded_size, &this->prop_, 0);
+    CHECK_EQ(status, CUDA_SUCCESS);
+    return alloc_handle;
+  }
+  void Reserve(std::size_t new_size);
+
+ public:
+  explicit GrowOnlyVirtualMemVec(CUmemLocationType type);
+
+  void GrowTo(std::size_t n_bytes) {
+    auto alloc_size = this->PhyCapacity();
+    if (n_bytes <= alloc_size) {
+      return;
+    }
+
+    std::size_t delta = n_bytes - alloc_size;
+    auto const padded_delta = RoundUp(delta, this->granularity_);
+    this->Reserve(alloc_size + padded_delta);
+
+    this->handles_.emplace_back(
+        std::unique_ptr<PhyAddrHandle, std::function<void(PhyAddrHandle *)>>{
+            new PhyAddrHandle{this->CreatePhysicalMem(padded_delta), padded_delta}, [&](auto *hdl) {
+              if (hdl) {
+                cu_.cuMemRelease(hdl->handle);
+              }
+            }});
+    auto ptr = this->DevPtr() + alloc_size;
+    this->MapBlock(ptr, this->handles_.back());
+  }
+
+  template <typename T>
+  xgboost::common::Span<T> GetSpan(std::size_t size) {
+    size_t n_bytes = size * sizeof(T);
+    this->GrowTo(n_bytes);
+    return xgboost::common::Span<T>(reinterpret_cast<T *>(this->DevPtr()), size);
+  }
+
+  ~GrowOnlyVirtualMemVec() noexcept(false) {
+    if (this->DevPtr() != 0) {
+      safe_cu(cu_.cuMemUnmap(this->DevPtr(), this->PhyCapacity()));
+    }
+
+    this->va_ranges_.clear();  // make sure all VA are freed before releasing the handles.
+    this->handles_.clear();    // release the handles
+  }
+
+  [[nodiscard]] void *data() { return reinterpret_cast<void *>(this->DevPtr()); }  // NOLINT
+  [[nodiscard]] std::size_t size() const { return this->PhyCapacity(); }           // NOLINT
+  [[nodiscard]] std::size_t Capacity() const;
+};
 }  // namespace detail

 inline detail::MemoryLogger &GlobalMemoryLogger() {
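
For orientation, the driver-level sequence that GrowOnlyVirtualMemVec wraps, written against the CuDriverApi handle from cuda_dr_utils.h. This is a condensed sketch of the pattern from the NVIDIA blog post linked in the class comment (a single device-local block, no growth or error recovery); the function name is illustrative and it is not part of the commit:

  #include <cstddef>

  #include "cuda_dr_utils.h"  // CuDriverApi, MakeAllocProp, MakeCuMemLocation, GetAllocGranularity, safe_cu

  // Illustrative sketch: allocate one physical block, map it into a reserved virtual
  // range, grant access, then tear everything down in reverse order.
  inline void VmmRoundTrip() {
    auto &cu = xgboost::cudr::GetGlobalCuDriverApi();
    auto prop = xgboost::cudr::MakeAllocProp(CU_MEM_LOCATION_TYPE_DEVICE);
    std::size_t size = xgboost::cudr::GetAllocGranularity(&prop);  // one granule

    CUdeviceptr ptr = 0;
    safe_cu(cu.cuMemAddressReserve(&ptr, size, 0, 0, 0));   // 1. reserve virtual addresses

    CUmemGenericAllocationHandle handle;
    safe_cu(cu.cuMemCreate(&handle, size, &prop, 0));        // 2. create physical memory
    safe_cu(cu.cuMemMap(ptr, size, 0, handle, 0));           // 3. back the range with it

    CUmemAccessDesc acc;
    acc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    xgboost::cudr::MakeCuMemLocation(CU_MEM_LOCATION_TYPE_DEVICE, &acc.location);
    safe_cu(cu.cuMemSetAccess(ptr, size, &acc, 1));          // 4. grant read/write access

    // ... ptr is now usable as `size` bytes of device memory ...

    safe_cu(cu.cuMemUnmap(ptr, size));                       // 5. unmap before releasing
    safe_cu(cu.cuMemRelease(handle));                        // 6. release the physical memory
    safe_cu(cu.cuMemAddressFree(ptr, size));                 // 7. free the virtual range
  }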
@@ -337,7 +337,7 @@ void ProcessWeightedSlidingWindow(Context const* ctx, Batch batch, MetaInfo cons
                                   int num_cuts_per_feature, bool is_ranking, float missing,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer* sketch_container) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   info.weights_.SetDevice(ctx->Device());
   auto weights = info.weights_.ConstDeviceSpan();
@@ -309,7 +309,7 @@ void MergeImpl(Context const *ctx, Span<SketchEntry const> const &d_x,

 void SketchContainer::Push(Context const *ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
                            common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights) {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   Span<SketchEntry> out;
   dh::device_vector<SketchEntry> cuts;
   bool first_window = this->Current().empty();
@@ -369,7 +369,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);

   auto key_it = dh::MakeTransformIterator<size_t>(
@@ -408,7 +408,7 @@ size_t SketchContainer::ScanInput(Context const *ctx, Span<SketchEntry> entries,

 void SketchContainer::Prune(Context const* ctx, std::size_t to) {
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -443,7 +443,7 @@ void SketchContainer::Prune(Context const* ctx, std::size_t to) {

 void SketchContainer::Merge(Context const *ctx, Span<OffsetT const> d_that_columns_ptr,
                             Span<SketchEntry const> that) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   auto self = dh::ToSpan(this->Current());
   LOG(DEBUG) << "Merge: self:" << HumanMemUnit(self.size_bytes()) << ". "
              << "That:" << HumanMemUnit(that.size_bytes()) << ". "
@@ -507,7 +507,7 @@ void SketchContainer::FixError() {
 }

 void SketchContainer::AllReduce(Context const* ctx, bool is_column_split) {
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
@@ -596,7 +596,7 @@ struct InvalidCatOp {

 void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-  SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());
   p_cuts->min_vals_.Resize(num_columns_);

   // Sync between workers.
@@ -206,7 +206,7 @@
   template <typename KeyComp = thrust::equal_to<size_t>>
   size_t Unique(Context const* ctx, KeyComp key_comp = thrust::equal_to<size_t>{}) {
     timer_.Start(__func__);
-    SetDevice(ctx->Ordinal());
+    curt::SetDevice(ctx->Ordinal());
     this->columns_ptr_.SetDevice(ctx->Device());
     Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
     CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
@@ -13,6 +13,8 @@

 #if defined(__linux__)
 #include <pthread.h>
+#include <sys/syscall.h>  // for SYS_getcpu
+#include <unistd.h>       // for syscall
 #endif

 namespace xgboost::common {
@@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
   return n_threads;
 }

+[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) {
+#ifdef SYS_getcpu
+  return syscall(SYS_getcpu, cpu, numa, NULL) == 0;
+#else
+  return false;
+#endif
+}
+
 void NameThread(std::thread* t, StringView name) {
 #if defined(__linux__)
   auto handle = t->native_handle();
@@ -306,10 +306,16 @@ class MemStackAllocator {
 };

 /**
- * \brief Constant that can be used for initializing static thread local memory.
+ * @brief Constant that can be used for initializing static thread local memory.
  */
 std::int32_t constexpr DefaultMaxThreads() { return 128; }

+/**
+ * @brief Get numa node on Linux. Other platforms are not supported. Returns false if the
+ *        call fails.
+ */
+[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa);
+
 /**
  * @brief Give the thread a name. Supports only pthread on linux.
  */
@@ -18,7 +18,7 @@ void Monitor::Start(std::string const &name) {
   auto &stats = statistics_map_[name];
   stats.timer.Start();
 #if defined(XGBOOST_USE_NVTX)
-  auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
+  auto range_handle = nvtx3::start_range_in<curt::NvtxDomain>(label_ + "::" + name);
   stats.nvtx_id = range_handle.get_value();
 #endif  // defined(XGBOOST_USE_NVTX)
 }
@@ -30,7 +30,7 @@ void Monitor::Stop(const std::string &name) {
   stats.timer.Stop();
   stats.count++;
 #if defined(XGBOOST_USE_NVTX)
-  nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
+  nvtx3::end_range_in<curt::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
 #endif  // defined(XGBOOST_USE_NVTX)
 }
 }
@@ -38,7 +38,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
 [[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) {
   // When booster is loaded from a memory image (Python pickle or R raw model), number of
   // available GPUs could be different. Wrap around it.
-  std::int32_t n_visible = common::AllVisibleGPUs();
+  std::int32_t n_visible = curt::AllVisibleGPUs();
   if (n_visible == 0) {
     if (device.IsCUDA()) {
       LOG(WARNING) << "No visible GPU is found, setting device to CPU.";
@@ -55,7 +55,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
   }

   if (device.IsCUDA()) {
-    common::SetDevice(device.ordinal);
+    curt::SetDevice(device.ordinal);
   }
   return device;
 }
@@ -139,7 +139,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
       n_rows{n_rows},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   this->InitCompressedData(ctx);
 }
@@ -154,7 +154,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
       row_stride{row_stride},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   this->InitCompressedData(ctx);
   this->CreateHistIndices(ctx, page, feature_types);
@@ -173,7 +173,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc
           common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))},
       n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   this->InitCompressedData(ctx);
@@ -319,7 +319,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m
                                  bst_idx_t n_rows,
                                  std::shared_ptr<common::HistogramCuts const> cuts)
     : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} {
-  common::SetDevice(ctx->Ordinal());
+  curt::SetDevice(ctx->Ordinal());

   if (this->IsDense()) {
     CopyDataToEllpack<true>(ctx, batch, feature_types, this, missing);
@@ -85,7 +85,7 @@ template <typename T>
   bytes += fo->Write(impl->is_dense);
   bytes += fo->Write(impl->row_stride);
   std::vector<common::CompressedByteT> h_gidx_buffer;
-  Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
+  Context ctx = Context{}.MakeCUDA(curt::CurrentDevice());
   [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
   bytes += common::WriteVec(fo, h_gidx_buffer);
   bytes += fo->Write(impl->base_rowid);
@@ -202,7 +202,7 @@ EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>::CreateReader(StringVi
  */
 template <typename F>
 void EllpackPageSourceImpl<F>::Fetch() {
-  common::SetDevice(this->Device().ordinal);
+  curt::SetDevice(this->Device().ordinal);
   if (!this->ReadCache()) {
     if (this->count_ != 0 && !this->sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
@@ -236,7 +236,7 @@ EllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>
  */
 template <typename F>
 void ExtEllpackPageSourceImpl<F>::Fetch() {
-  common::SetDevice(this->Device().ordinal);
+  curt::SetDevice(this->Device().ordinal);
   if (!this->ReadCache()) {
     auto iter = this->source_->Iter();
     CHECK_EQ(this->count_, iter);
@@ -61,7 +61,7 @@ template <typename S>
 class EllpackFormatPolicy {
   std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
   DeviceOrd device_;
-  bool has_hmm_{common::SupportsPageableMem()};
+  bool has_hmm_{curt::SupportsPageableMem()};

  public:
   using FormatT = EllpackPageRawFormat;
@@ -71,7 +71,7 @@ class EllpackFormatPolicy {
     StringView msg{" The overhead of iterating through external memory might be significant."};
     if (!has_hmm_) {
       LOG(WARNING) << "CUDA heterogeneous memory management is not available." << msg;
-    } else if (!common::SupportsAts()) {
+    } else if (!curt::SupportsAts()) {
       LOG(WARNING) << "CUDA address translation service is not available." << msg;
     }
 #if !defined(XGBOOST_USE_RMM)
@@ -121,7 +121,7 @@ class EllpackCacheStreamPolicy : public F<S> {

 template <typename S, template <typename> typename F>
 class EllpackMmapStreamPolicy : public F<S> {
-  bool has_hmm_{common::SupportsPageableMem()};
+  bool has_hmm_{curt::SupportsPageableMem()};

  public:
   using WriterT = common::AlignedFileWriteStream;
@@ -64,8 +64,8 @@ void MakeSketches(Context const* ctx,
    * Get the data shape.
    */
   // We use do while here as the first batch is fetched in ctor
-  CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
-  common::SetDevice(dh::GetDevice(ctx).ordinal);
+  CHECK_LT(ctx->Ordinal(), curt::AllVisibleGPUs());
+  curt::SetDevice(dh::GetDevice(ctx).ordinal);
   if (ext_info.n_features == 0) {
     ext_info.n_features = data::BatchColumns(proxy);
     auto rc = collective::Allreduce(ctx, linalg::MakeVec(&ext_info.n_features, 1),
@@ -124,7 +124,7 @@ void MakeSketches(Context const* ctx,
               ext_info.base_rows.begin());

   // Get reference
-  common::SetDevice(dh::GetDevice(ctx).ordinal);
+  curt::SetDevice(dh::GetDevice(ctx).ordinal);
   if (!ref) {
     HostDeviceVector<FeatureType> ft;
     common::SketchContainer final_sketch(
@@ -37,7 +37,7 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
   size_t max_row_perbatch;

   void CheckGPUSupport() {
-    auto n_gpus = common::AllVisibleGPUs();
+    auto n_gpus = curt::AllVisibleGPUs();
     if (n_gpus == 0 && this->updater == "gpu_coord_descent") {
       common::AssertGPUSupport();
       this->UpdateAllowUnknown(Args{{"updater", "coord_descent"}});
@@ -105,7 +105,7 @@ void GBTree::Configure(Args const& cfg) {
   }
   cpu_predictor_->Configure(cfg);
 #if defined(XGBOOST_USE_CUDA)
-  auto n_gpus = common::AllVisibleGPUs();
+  auto n_gpus = curt::AllVisibleGPUs();
   if (!gpu_predictor_) {
     gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
   }
@@ -344,7 +344,7 @@ void GBTree::LoadConfig(Json const& in) {
   // This would cause all trees to be pushed to trees_to_update
   // e.g. updating a model, then saving and loading it would result in an empty model
   tparam_.process_type = TreeProcessType::kDefault;
-  std::int32_t const n_gpus = common::AllVisibleGPUs();
+  std::int32_t const n_gpus = curt::AllVisibleGPUs();

   auto msg = StringView{
       R"(
@@ -482,7 +482,7 @@ void ExtractPaths(Context const* ctx,
                   dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>* paths,
                   DeviceModel* model, dh::device_vector<uint32_t>* path_categories,
                   DeviceOrd device) {
-  common::SetDevice(device.ordinal);
+  curt::SetDevice(device.ordinal);
   auto& device_model = *model;

   dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -937,7 +937,7 @@ class GPUPredictor : public xgboost::Predictor {
       : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}

   ~GPUPredictor() override {
-    if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
+    if (ctx_->IsCUDA() && ctx_->Ordinal() < curt::AllVisibleGPUs()) {
       dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     }
   }
@@ -184,7 +184,7 @@ struct GPUHistMakerDevice {
   // Reset values for each update iteration
   [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair> const* dh_gpair, DMatrix* p_fmat) {
     this->monitor.Start(__func__);
-    common::SetDevice(ctx_->Ordinal());
+    curt::SetDevice(ctx_->Ordinal());

     auto const& info = p_fmat->Info();
@@ -789,7 +789,7 @@ class GPUHistMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Hist]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    common::CheckComputeCapability();
+    curt::CheckComputeCapability();
     initialised_ = false;

     monitor_.Init("updater_gpu_hist");
@@ -835,7 +835,7 @@ class GPUHistMaker : public TreeUpdater {
         ctx_, linalg::MakeVec(&column_sampling_seed, sizeof(column_sampling_seed)), 0));
     this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);

-    common::SetDevice(ctx_->Ordinal());
+    curt::SetDevice(ctx_->Ordinal());
     p_fmat->Info().feature_types.SetDevice(ctx_->Device());

     std::vector<bst_idx_t> batch_ptr;
@@ -909,7 +909,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Approx]: Configure";
     hist_maker_param_.UpdateAllowUnknown(args);
-    common::CheckComputeCapability();
+    curt::CheckComputeCapability();
     initialised_ = false;

     monitor_.Init(this->Name());
|
||||
} // namespace
|
||||
|
||||
TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
@ -105,7 +105,7 @@ TEST_F(MGPUAllgatherTest, MGPUTestVRing) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUAllgatherTest, MGPUTestVBcast) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h> // for host_vector
|
||||
|
||||
#include "../../../src/common/common.h" // for AllVisibleGPUs
|
||||
#include "../../../src/common/cuda_rt_utils.h" // for AllVisibleGPUs
|
||||
#include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector
|
||||
#include "../../../src/common/type.h" // for EraseType
|
||||
#include "test_worker.cuh" // for NCCLWorkerForTest
|
||||
@ -46,7 +46,7 @@ class Worker : public NCCLWorkerForTest {
|
||||
} // namespace
|
||||
|
||||
TEST_F(MGPUAllreduceTest, BitOr) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
@ -56,7 +56,7 @@ TEST_F(MGPUAllreduceTest, BitOr) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUAllreduceTest, Sum) {
|
||||
auto n_workers = common::AllVisibleGPUs();
|
||||
auto n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
Worker w{host, port, timeout, n_workers, r};
|
||||
|
||||
@ -37,7 +37,7 @@ TEST_F(CommGroupTest, Basic) {
|
||||
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
TEST_F(CommGroupTest, BasicGPU) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout,
|
||||
std::int32_t r) {
|
||||
auto ctx = MakeCUDACtx(r);
|
||||
|
||||
@ -205,7 +205,7 @@ class BaseMGPUTest : public ::testing::Test {
|
||||
template <typename Fn>
|
||||
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
|
||||
[[maybe_unused]] bool emulate_if_single = false) const {
|
||||
auto n_gpus = common::AllVisibleGPUs();
|
||||
auto n_gpus = curt::AllVisibleGPUs();
|
||||
if (is_federated) {
|
||||
#if defined(XGBOOST_USE_FEDERATED)
|
||||
if (n_gpus == 1 && emulate_if_single) {
|
||||
|
||||
@ -3,6 +3,11 @@
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <numeric> // for iota
|
||||
#include <thrust/detail/sequence.inl> // for sequence
|
||||
|
||||
#include "../../../src/common/cuda_rt_utils.h" // for DrVersion
|
||||
#include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory
|
||||
#include "../../../src/common/device_vector.cuh"
|
||||
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
|
||||
|
||||
@ -18,4 +23,96 @@ TEST(DeviceUVector, Basic) {
|
||||
ASSERT_EQ(peak, n_bytes);
|
||||
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
namespace {
|
||||
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
|
||||
public:
|
||||
void Run() {
|
||||
auto type = this->GetParam();
|
||||
detail::GrowOnlyVirtualMemVec vec{type};
|
||||
auto prop = xgboost::cudr::MakeAllocProp(type);
|
||||
auto gran = xgboost::cudr::GetAllocGranularity(&prop);
|
||||
ASSERT_GE(gran, 2);
|
||||
auto data = vec.GetSpan<std::int32_t>(32); // should be smaller than granularity
|
||||
ASSERT_EQ(data.size(), 32);
|
||||
static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);
|
||||
|
||||
std::vector<std::int32_t> h_data(data.size());
|
||||
auto check = [&] {
|
||||
for (std::size_t i = 0; i < h_data.size(); ++i) {
|
||||
ASSERT_EQ(h_data[i], i);
|
||||
}
|
||||
};
|
||||
auto fill = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> data) {
|
||||
if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
|
||||
thrust::sequence(dh::CachingThrustPolicy(), data.data() + n_orig, data.data() + data.size(),
|
||||
n_orig);
|
||||
dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault));
|
||||
} else {
|
||||
std::iota(data.data() + n_orig, data.data() + data.size(), n_orig);
|
||||
std::copy_n(data.data(), data.size(), h_data.data());
|
||||
}
|
||||
};
|
||||
|
||||
fill(0, data);
|
||||
check();
|
||||
|
||||
auto n_orig = data.size();
|
||||
// Should be smaller than granularity, use already reserved.
|
||||
data = vec.GetSpan<std::int32_t>(128);
|
||||
h_data.resize(data.size());
|
||||
fill(n_orig, data);
|
||||
check();
|
||||
if (128 < gran) {
|
||||
ASSERT_EQ(vec.Capacity(), gran);
|
||||
}
|
||||
|
||||
n_orig = data.size();
|
||||
data = vec.GetSpan<std::int32_t>(gran / 2);
|
||||
h_data.resize(data.size());
|
||||
fill(n_orig, data);
|
||||
check();
|
||||
ASSERT_EQ(vec.Capacity(), gran * 2);
|
||||
|
||||
n_orig = data.size();
|
||||
data = vec.GetSpan<std::int32_t>(gran);
|
||||
h_data.resize(data.size());
|
||||
fill(n_orig, data);
|
||||
check();
|
||||
ASSERT_EQ(vec.Capacity(), gran * 4);
|
||||
}
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_P(TestVirtualMem, Alloc) { this->Run(); }
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
Basic, TestVirtualMem,
|
||||
::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
|
||||
[](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
|
||||
auto type = info.param;
|
||||
switch (type) {
|
||||
case CU_MEM_LOCATION_TYPE_DEVICE:
|
||||
return "Device";
|
||||
case CU_MEM_LOCATION_TYPE_HOST_NUMA:
|
||||
return "HostNuma";
|
||||
default:
|
||||
LOG(FATAL) << "unreachable";
|
||||
}
|
||||
return nullptr;
|
||||
});
|
||||
#endif // defined(__linux__)
|
||||
|
||||
TEST(TestVirtualMem, Version) {
|
||||
std::int32_t major, minor;
|
||||
xgboost::curt::DrVersion(&major, &minor);
|
||||
LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
|
||||
PinnedMemory pinned;
|
||||
if (major >= 12 && minor >= 5) {
|
||||
ASSERT_TRUE(pinned.IsVm());
|
||||
} else {
|
||||
ASSERT_FALSE(pinned.IsVm());
|
||||
}
|
||||
}
|
||||
} // namespace dh
|
||||
|
||||
@ -578,7 +578,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
|
||||
|
||||
namespace {
|
||||
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
|
||||
common::SetDevice(ctx->Ordinal());
|
||||
curt::SetDevice(ctx->Ordinal());
|
||||
auto n = n_samples * n_features;
|
||||
std::vector<float> x;
|
||||
x.resize(n);
|
||||
|
||||
@ -100,7 +100,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
|
||||
}
|
||||
|
||||
void TestHostDeviceVector(size_t n, DeviceOrd device) {
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
HostDeviceVector<int> v;
|
||||
InitHostDeviceVector(n, device, &v);
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
@ -119,7 +119,7 @@ TEST(HostDeviceVector, Basic) {
|
||||
TEST(HostDeviceVector, Copy) {
|
||||
size_t n = 1001;
|
||||
auto device = DeviceOrd::CUDA(0);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(curt::SetDevice);
|
||||
|
||||
HostDeviceVector<int> v;
|
||||
{
|
||||
|
||||
@ -72,7 +72,7 @@ TEST_P(TestEllpackPageRawFormat, DiskIO) {
|
||||
}
|
||||
|
||||
TEST_P(TestEllpackPageRawFormat, DiskIOHmm) {
|
||||
if (common::SupportsPageableMem()) {
|
||||
if (curt::SupportsPageableMem()) {
|
||||
EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy> policy{true};
|
||||
this->Run(&policy, this->GetParam());
|
||||
} else {
|
||||
|
||||
@ -655,7 +655,7 @@ class RMMAllocator {
|
||||
std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
|
||||
std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
|
||||
int n_gpu;
|
||||
RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
|
||||
RMMAllocator() : n_gpu(curt::AllVisibleGPUs()) {
|
||||
int current_device;
|
||||
CHECK_EQ(cudaGetDevice(¤t_device), cudaSuccess);
|
||||
for (int i = 0; i < n_gpu; ++i) {
|
||||
@ -697,5 +697,5 @@ void DeleteRMMResource(RMMAllocator*) {}
|
||||
RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
|
||||
#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
|
||||
|
||||
std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
|
||||
std::int32_t DistGpuIdx() { return curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
|
||||
} // namespace xgboost
|
||||
|
||||
@ -34,7 +34,7 @@
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
|
||||
#define GPUIDX (curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
|
||||
#else
|
||||
#define GPUIDX (-1)
|
||||
#endif
|
||||
|
||||
@ -47,7 +47,7 @@ class TestDistributedMetric : public ::testing::TestWithParam<Param> {
|
||||
|
||||
std::int32_t n_workers{0};
|
||||
if (device.IsCUDA()) {
|
||||
n_workers = common::AllVisibleGPUs();
|
||||
n_workers = curt::AllVisibleGPUs();
|
||||
} else {
|
||||
n_workers = std::min(static_cast<std::int32_t>(std::thread::hardware_concurrency()), 3);
|
||||
}
|
||||
|
||||
@ -102,14 +102,14 @@ void TestAllgatherV(std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
} // namespace
|
||||
|
||||
TEST_F(FederatedCollTestGPU, Allreduce) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
TestAllreduce(comm, rank, n_workers);
|
||||
});
|
||||
}
|
||||
|
||||
TEST(FederatedCollGPUGlobal, Allreduce) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederatedGlobal(n_workers, [&] {
|
||||
auto r = collective::GetRank();
|
||||
auto world = collective::GetWorldSize();
|
||||
@ -135,14 +135,14 @@ TEST(FederatedCollGPUGlobal, Allreduce) {
|
||||
}
|
||||
|
||||
TEST_F(FederatedCollTestGPU, Broadcast) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
TestBroadcast(comm, rank);
|
||||
});
|
||||
}
|
||||
|
||||
TEST_F(FederatedCollTestGPU, Allgather) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
TestAllgather(comm, rank, n_workers);
|
||||
});
|
||||
@ -150,7 +150,7 @@ TEST_F(FederatedCollTestGPU, Allgather) {
|
||||
|
||||
TEST_F(FederatedCollTestGPU, AllgatherV) {
|
||||
std::int32_t n_workers = 2;
|
||||
if (common::AllVisibleGPUs() < n_workers) {
|
||||
if (curt::AllVisibleGPUs() < n_workers) {
|
||||
GTEST_SKIP_("At least 2 GPUs are required for the test.");
|
||||
}
|
||||
TestFederated(n_workers, [=](std::shared_ptr<FederatedComm> comm, std::int32_t rank) {
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
|
||||
namespace xgboost::collective {
|
||||
TEST(CommGroup, Federated) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
|
||||
Context ctx;
|
||||
ASSERT_EQ(comm_group->Rank(), r);
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
|
||||
namespace xgboost::collective {
|
||||
TEST(CommGroup, FederatedGPU) {
|
||||
std::int32_t n_workers = common::AllVisibleGPUs();
|
||||
std::int32_t n_workers = curt::AllVisibleGPUs();
|
||||
TestFederatedGroup(n_workers, [&](std::shared_ptr<CommGroup> comm_group, std::int32_t r) {
|
||||
Context ctx = MakeCUDACtx(0);
|
||||
auto const& comm = comm_group->Ctx(&ctx, DeviceOrd::CUDA(0));
|
||||
|
||||
@ -299,7 +299,7 @@ TEST(GPUPredictor, IterationRange) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
|
||||
TestIterationRangeColumnSplit(common::AllVisibleGPUs(), true);
|
||||
TestIterationRangeColumnSplit(curt::AllVisibleGPUs(), true);
|
||||
}
|
||||
|
||||
TEST(GPUPredictor, CategoricalPrediction) {
|
||||
@ -312,7 +312,7 @@ TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
|
||||
}
|
||||
|
||||
TEST(GPUPredictor, CategoricalPredictLeaf) {
|
||||
auto ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
auto ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
TestCategoricalPredictLeaf(&ctx, false);
|
||||
}
|
||||
|
||||
@ -358,7 +358,7 @@ TEST(GPUPredictor, Sparse) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUPredictorTest, SparseColumnSplit) {
|
||||
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.2);
|
||||
TestSparsePredictionColumnSplit(common::AllVisibleGPUs(), true, 0.8);
|
||||
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.2);
|
||||
TestSparsePredictionColumnSplit(curt::AllVisibleGPUs(), true, 0.8);
|
||||
}
|
||||
} // namespace xgboost::predictor
|
||||
|
||||
@ -320,7 +320,7 @@ void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
|
||||
auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
}
|
||||
auto learner = LearnerForTest(&ctx, m_train, kIters);
|
||||
auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
|
||||
@ -354,7 +354,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
|
||||
void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
}
|
||||
size_t constexpr kCols = 10;
|
||||
PredictionCacheEntry out_predictions;
|
||||
@ -507,7 +507,7 @@ void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
|
||||
auto const rank = collective::GetRank();
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : rank);
|
||||
}
|
||||
auto n_threads = collective::GetWorkerLocalThreads(world_size);
|
||||
ctx.UpdateAllowUnknown(
|
||||
@ -679,7 +679,7 @@ void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::siz
|
||||
std::vector<float> const &expected_predt) {
|
||||
Context ctx;
|
||||
if (use_gpu) {
|
||||
ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
ctx = MakeCUDACtx(curt::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
|
||||
}
|
||||
auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
|
||||
std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
|
||||
|
||||
@ -30,7 +30,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
|
||||
|
||||
TEST(Context, DeviceOrdinal) {
|
||||
Context ctx;
|
||||
auto n_vis = common::AllVisibleGPUs();
|
||||
auto n_vis = curt::AllVisibleGPUs();
|
||||
auto ord = n_vis - 1;
|
||||
|
||||
std::string device = "cuda:" + std::to_string(ord);
|
||||
@ -82,7 +82,7 @@ TEST(Context, GPUId) {
|
||||
ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
|
||||
TestCUDA(ctx, 0);
|
||||
|
||||
auto n_vis = common::AllVisibleGPUs();
|
||||
auto n_vis = curt::AllVisibleGPUs();
|
||||
auto ord = n_vis - 1;
|
||||
ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
|
||||
TestCUDA(ctx, ord);
|
||||
|
||||
@ -759,7 +759,7 @@ void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args
|
||||
|
||||
auto world_size{3};
|
||||
if (use_gpu) {
|
||||
world_size = common::AllVisibleGPUs();
|
||||
world_size = curt::AllVisibleGPUs();
|
||||
// Simulate MPU on a single GPU. Federated doesn't use nccl, can run multiple
|
||||
// instances on the same GPU.
|
||||
if (world_size == 1 && federated) {
|
||||
|
||||
@ -595,7 +595,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
|
||||
if (common::AllVisibleGPUs() > 1) {
|
||||
if (curt::AllVisibleGPUs() > 1) {
|
||||
// We can't emulate multiple GPUs with NCCL.
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(false); }, false, true);
|
||||
}
|
||||
@ -603,7 +603,7 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
|
||||
}
|
||||
|
||||
TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
|
||||
if (common::AllVisibleGPUs() > 1) {
|
||||
if (curt::AllVisibleGPUs() > 1) {
|
||||
// We can't emulate multiple GPUs with NCCL.
|
||||
this->DoTest([] { VerifyColumnSplitEvaluateSingleSplit(true); }, false, true);
|
||||
}
|
||||
|
||||