Avoid thrust vector initialization. (#10544)

* Avoid thrust vector initialization.

- Add a wrapper (`DeviceUVector`) for `rmm::device_uvector`.
- Split the `Resize` method for `HostDeviceVector` (HDV) into initializing and non-initializing overloads.
Jiaming Yuan 2024-07-11 17:29:27 +08:00 committed by GitHub
parent 89da9f9741
commit 1ca4bfd20e
13 changed files with 510 additions and 291 deletions


@@ -135,7 +135,9 @@ class HostDeviceVector {
void SetDevice(DeviceOrd device) const;
void Resize(size_t new_size, T v = T());
void Resize(std::size_t new_size);
/** @brief Resize and initialize the data if the new size is larger than the old size. */
void Resize(std::size_t new_size, T v);
using value_type = T; // NOLINT
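
For orientation, a hedged sketch of how the two overloads behave after this change; the calls mirror the declarations above, while the function itself is invented for illustration:

```cpp
#include "xgboost/host_device_vector.h"

// Sketch: Resize(n) grows storage without a fill value (under the RMM build
// the new device tail is left uninitialized), while Resize(n, v) fills only
// the newly added elements. The host and non-RMM paths still value-initialize.
void ResizeSketch(xgboost::HostDeviceVector<float>* vec) {
  vec->SetDevice(xgboost::DeviceOrd::CUDA(0));
  vec->Resize(100);        // storage only
  vec->Resize(200, 1.0f);  // elements [100, 200) become 1.0f
}
```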


@@ -18,7 +18,7 @@ struct CUDAContext {
* \brief Caching thrust policy.
*/
auto CTP() const {
#if THRUST_MAJOR_VERSION >= 2
#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
#else
return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
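
For background, `thrust::cuda::par_nosync` (available since Thrust 1.16) returns a policy whose algorithms skip the implicit end-of-call synchronization. A self-contained sketch of the same pattern; the function, functor, and stream handling are illustrative, not part of this diff:

```cpp
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include <thrust/version.h>  // for THRUST_MAJOR_VERSION

struct Doubler {
  __device__ float operator()(float x) const { return 2.0f * x; }
};

// Enqueue a transform on `stream` without blocking the host; the caller must
// synchronize the stream before reading the results.
void ScaleAsync(thrust::device_vector<float>* data, cudaStream_t stream) {
#if THRUST_MAJOR_VERSION >= 2
  auto policy = thrust::cuda::par_nosync.on(stream);
#else
  auto policy = thrust::cuda::par.on(stream);  // older Thrust syncs per algorithm
#endif
  thrust::transform(policy, data->begin(), data->end(), data->begin(), Doubler{});
}
```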


@@ -1,26 +1,21 @@
/**
* Copyright 2017-2023 XGBoost contributors
* Copyright 2017-2024, XGBoost contributors
*/
#pragma once
#include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/binary_search.h> // thrust::upper_bound
#include <thrust/device_ptr.h> // for device_ptr
#include <thrust/device_vector.h> // for device_vector
#include <thrust/execution_policy.h> // thrust::seq
#include <thrust/gather.h> // gather
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/discard_iterator.h> // for discard_iterator
#include <thrust/iterator/transform_output_iterator.h> // make_transform_output_iterator
#include <thrust/logical.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <thrust/transform_scan.h>
#include <thrust/unique.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_type.cuh> // for UnitWord
#include <sstream>
#include <string>
#include <tuple>
@@ -28,22 +23,14 @@
#include "../collective/communicator-inl.h"
#include "common.h"
#include "device_vector.cuh"
#include "xgboost/host_device_vector.h"
#include "xgboost/logging.h"
#include "xgboost/span.h"
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
#include "rmm/version_config.hpp"
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#error "Please use RMM version 0.18 or later"
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
#error "Please use RMM version 0.18 or later"
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#if defined(XGBOOST_USE_RMM)
#include <rmm/exec_policy.hpp>
#endif // defined(XGBOOST_USE_RMM)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
}
namespace detail {
/** \brief Keeps track of global device memory allocations. Thread safe.*/
class MemoryLogger {
// Information for a single device
struct DeviceStats {
size_t currently_allocated_bytes{ 0 };
size_t peak_allocated_bytes{ 0 };
size_t num_allocations{ 0 };
size_t num_deallocations{ 0 };
std::map<void *, size_t> device_allocations;
void RegisterAllocation(void *ptr, size_t n) {
device_allocations[ptr] = n;
currently_allocated_bytes += n;
peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
num_allocations++;
CHECK_GT(num_allocations, num_deallocations);
}
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
auto itr = device_allocations.find(ptr);
if (itr == device_allocations.end()) {
LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
<< " that was never allocated\n"
<< dmlc::StackTrace();
} else {
num_deallocations++;
CHECK_LE(num_deallocations, num_allocations);
currently_allocated_bytes -= itr->second;
device_allocations.erase(itr);
}
}
};
DeviceStats stats_;
std::mutex mutex_;
public:
void RegisterAllocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_.RegisterAllocation(ptr, n);
}
void RegisterDeallocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
stats_.RegisterDeallocation(ptr, n, current_device);
}
size_t PeakMemory() const {
return stats_.peak_allocated_bytes;
}
size_t CurrentlyAllocatedBytes() const {
return stats_.currently_allocated_bytes;
}
void Clear()
{
stats_ = DeviceStats();
}
void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
safe_cuda(cudaGetDevice(&current_device));
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
<< " ========";
LOG(CONSOLE) << "Peak memory usage: "
<< stats_.peak_allocated_bytes / 1048576 << "MiB";
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
}
};
} // namespace detail
inline detail::MemoryLogger &GlobalMemoryLogger() {
static detail::MemoryLogger memory_logger;
return memory_logger;
}
// dh::DebugSyncDevice(__FILE__, __LINE__);
inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
if (file != "" && line != -1) {
@@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
safe_cuda(cudaGetLastError());
}
namespace detail {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
inline void ThrowOOMError(std::string const& err, size_t bytes) {
auto device = CurrentDevice();
auto rank = xgboost::collective::GetRank();
std::stringstream ss;
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
<< "- Free memory: " << AvailableMemory(device) << "\n"
<< "- Requested memory: " << bytes << std::endl;
LOG(FATAL) << ss.str();
}
/**
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
*/
template <class T>
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template<typename U>
struct rebind // NOLINT
{
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
};
pointer allocate(size_t n) { // NOLINT
pointer ptr;
try {
ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
ThrowOOMError(e.what(), n * sizeof(T));
}
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
return ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
SuperT::deallocate(ptr, n);
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
};
/**
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
* RMM pool allocator is enabled. Does not initialise memory on construction.
*/
template <class T>
struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template<typename U>
struct rebind // NOLINT
{
using other = XGBCachingDeviceAllocatorImpl<U>; // NOLINT
};
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
pointer thrust_ptr;
if (use_cub_allocator_) {
T* raw_ptr{nullptr};
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
if (errc != cudaSuccess) {
ThrowOOMError("Caching allocator", n * sizeof(T));
}
thrust_ptr = pointer(raw_ptr);
} else {
try {
thrust_ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
ThrowOOMError(e.what(), n * sizeof(T));
}
}
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
if (use_cub_allocator_) {
GetGlobalCachingAllocator().DeviceFree(ptr.get());
} else {
SuperT::deallocate(ptr, n);
}
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBCachingDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBOOST_DEVICE void construct(T *) {} // NOLINT
private:
bool use_cub_allocator_{true};
};
} // namespace detail
// Declare xgboost allocators
// Replacement of allocator with custom backend should occur here
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
/*! Be careful that the initialization constructor is a no-op, which means calling
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
* `vec.resize(n, 0)`*/
template <typename T>
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
/** \brief Specialisation of thrust device vector using custom allocator. */
template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>; // NOLINT
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT
// Faster to instantiate than caching_device_vector and invokes no synchronisation
// Use this where vector functionality (e.g. resize) is not required
template <typename T>
@@ -734,6 +508,11 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
return ToSpan(vec, offset, size);
}
template <typename T>
xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
return {thrust::raw_pointer_cast(vec.data()), vec.size()};
}
// thrust begin, similar to std::begin
template <typename T>
thrust::device_ptr<T> tbegin(xgboost::HostDeviceVector<T>& vector) { // NOLINT
@@ -1117,6 +896,15 @@ class CUDAStream {
void Sync() { this->View().Sync(); }
};
inline auto CachingThrustPolicy() {
XGBCachingDeviceAllocator<char> alloc;
#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
return thrust::cuda::par_nosync(alloc).on(DefaultStream());
#else
return thrust::cuda::par(alloc).on(DefaultStream());
#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
}
// Force nvcc to load data as constant
template <typename T>
class LDGIterator {
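
Taken together, the two additions to this file compose as below. A hedged sketch assuming an RMM or Thrust >= 2 build; `DeviceUVector` comes from the new `device_vector.cuh` shown later, and the function itself is invented for illustration:

```cpp
#include <thrust/sequence.h>  // for sequence

#include "device_helpers.cuh"
#include "device_vector.cuh"

// Illustrative only: resize without initialization, write through the cached
// non-syncing policy, then expose the storage to kernels as a Span.
void FillAndView() {
  dh::DeviceUVector<int> vec;
  vec.Resize(100);  // no fill pass under RMM
  thrust::sequence(dh::CachingThrustPolicy(), vec.begin(), vec.end());
  auto s = dh::ToSpan(vec);    // Span<int> over the raw storage
  dh::DefaultStream().Sync();  // make the writes visible before host use
  (void)s;
}
```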


@@ -0,0 +1,27 @@
/**
* Copyright 2017-2024, XGBoost contributors
*/
#include "../collective/communicator-inl.h" // for GetRank
#include "device_helpers.cuh" // for CurrentDevice
#include "device_vector.cuh"
namespace dh {
namespace detail {
void ThrowOOMError(std::string const &err, size_t bytes) {
auto device = CurrentDevice();
auto rank = xgboost::collective::GetRank();
std::stringstream ss;
ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
<< "- Free memory: " << dh::AvailableMemory(device) << "\n"
<< "- Requested memory: " << bytes << std::endl;
LOG(FATAL) << ss.str();
}
} // namespace detail
#if defined(XGBOOST_USE_RMM)
LoggingResource *GlobalLoggingResource() {
static auto mr{std::make_unique<LoggingResource>()};
return mr.get();
}
#endif // defined(XGBOOST_USE_RMM)
} // namespace dh
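
A short sketch of what the singleton is for, assuming an RMM build; `allocate`/`deallocate` are the standard `rmm::mr::device_memory_resource` entry points, and the function name is invented for illustration:

```cpp
#if defined(XGBOOST_USE_RMM)
#include <rmm/cuda_stream_view.hpp>  // for cuda_stream_per_thread

// Allocations routed through the singleton are recorded by the XGBoost memory
// logger (when debug verbosity is enabled), then forwarded upstream.
void LoggedAllocSketch() {
  auto* mr = dh::GlobalLoggingResource();
  void* p = mr->allocate(1 << 20, rmm::cuda_stream_per_thread);  // 1 MiB, registered
  mr->deallocate(p, 1 << 20, rmm::cuda_stream_per_thread);       // deregistered
}
#endif  // defined(XGBOOST_USE_RMM)
```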


@@ -0,0 +1,330 @@
/**
* Copyright 2017-2024, XGBoost Contributors
*/
#pragma once
#include <thrust/device_malloc_allocator.h> // for device_malloc_allocator
#include <thrust/device_ptr.h> // for device_ptr
#include <thrust/device_vector.h> // for device_vector
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include <rmm/device_uvector.hpp> // for device_uvector
#include <rmm/exec_policy.hpp> // for exec_policy_nosync
#include <rmm/mr/device/device_memory_resource.hpp> // for device_memory_resource
#include <rmm/mr/device/per_device_resource.hpp> // for get_current_device_resource
#include <rmm/mr/device/thrust_allocator_adaptor.hpp> // for thrust_allocator
#include <rmm/version_config.hpp> // for RMM_VERSION_MAJOR
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#error "Please use RMM version 0.18 or later"
#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
#error "Please use RMM version 0.18 or later"
#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include <cstddef> // for size_t
#include <cub/util_allocator.cuh> // for CachingDeviceAllocator
#include <cub/util_device.cuh> // for CurrentDevice
#include <map> // for map
#include <memory> // for unique_ptr
#include "common.h" // for safe_cuda
#include "xgboost/logging.h"
namespace dh {
namespace detail {
/** \brief Keeps track of global device memory allocations. Thread safe.*/
class MemoryLogger {
// Information for a single device
struct DeviceStats {
std::size_t currently_allocated_bytes{0};
size_t peak_allocated_bytes{0};
size_t num_allocations{0};
size_t num_deallocations{0};
std::map<void *, size_t> device_allocations;
void RegisterAllocation(void *ptr, size_t n) {
device_allocations[ptr] = n;
currently_allocated_bytes += n;
peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
num_allocations++;
CHECK_GT(num_allocations, num_deallocations);
}
void RegisterDeallocation(void *ptr, size_t n, int current_device) {
auto itr = device_allocations.find(ptr);
if (itr == device_allocations.end()) {
LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
<< " that was never allocated\n"
<< dmlc::StackTrace();
} else {
num_deallocations++;
CHECK_LE(num_deallocations, num_allocations);
currently_allocated_bytes -= itr->second;
device_allocations.erase(itr);
}
}
};
DeviceStats stats_;
std::mutex mutex_;
public:
void RegisterAllocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
stats_.RegisterAllocation(ptr, n);
}
void RegisterDeallocation(void *ptr, size_t n) {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice());
}
size_t PeakMemory() const { return stats_.peak_allocated_bytes; }
size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; }
void Clear() { stats_ = DeviceStats(); }
void Log() {
if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
return;
}
std::lock_guard<std::mutex> guard(mutex_);
int current_device;
dh::safe_cuda(cudaGetDevice(&current_device));
LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
<< " ========";
LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
}
};
void ThrowOOMError(std::string const &err, size_t bytes);
} // namespace detail
inline detail::MemoryLogger &GlobalMemoryLogger() {
static detail::MemoryLogger memory_logger;
return memory_logger;
}
namespace detail {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
template <typename T>
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
/**
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
*/
template <class T>
struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template <typename U>
struct rebind // NOLINT
{
using other = XGBDefaultDeviceAllocatorImpl<U>; // NOLINT
};
pointer allocate(size_t n) { // NOLINT
pointer ptr;
try {
ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
detail::ThrowOOMError(e.what(), n * sizeof(T));
}
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
return ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
SuperT::deallocate(ptr, n);
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
};
/**
* \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
* RMM pool allocator is enabled. Does not initialise memory on construction.
*/
template <class T>
struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
using SuperT = XGBBaseDeviceAllocator<T>;
using pointer = thrust::device_ptr<T>; // NOLINT
template <typename U>
struct rebind // NOLINT
{
using other = XGBCachingDeviceAllocatorImpl<U>; // NOLINT
};
cub::CachingDeviceAllocator &GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
pointer thrust_ptr;
if (use_cub_allocator_) {
T *raw_ptr{nullptr};
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
if (errc != cudaSuccess) {
detail::ThrowOOMError("Caching allocator", n * sizeof(T));
}
thrust_ptr = pointer(raw_ptr);
} else {
try {
thrust_ptr = SuperT::allocate(n);
dh::safe_cuda(cudaGetLastError());
} catch (const std::exception &e) {
detail::ThrowOOMError(e.what(), n * sizeof(T));
}
}
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr;
}
void deallocate(pointer ptr, size_t n) { // NOLINT
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
if (use_cub_allocator_) {
GetGlobalCachingAllocator().DeviceFree(ptr.get());
} else {
SuperT::deallocate(ptr, n);
}
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBCachingDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBOOST_DEVICE void construct(T *) {} // NOLINT
private:
bool use_cub_allocator_{true};
};
} // namespace detail
// Declare xgboost allocators
// Replacement of allocator with custom backend should occur here
template <typename T>
using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
/** Be careful that the initialization constructor is a no-op, which means calling
* `vec.resize(n)` won't initialize the memory region to 0. Instead use
* `vec.resize(n, 0)`
*/
template <typename T>
using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
/** @brief Specialisation of thrust device vector using custom allocator. */
template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>; // NOLINT
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT
#if defined(XGBOOST_USE_RMM)
/**
* @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead.
*/
class LoggingResource : public rmm::mr::device_memory_resource {
rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()};
public:
LoggingResource() = default;
~LoggingResource() override = default;
LoggingResource(LoggingResource const &) = delete;
LoggingResource &operator=(LoggingResource const &) = delete;
LoggingResource(LoggingResource &&) noexcept = default;
LoggingResource &operator=(LoggingResource &&) noexcept = default;
[[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT
return mr_;
}
[[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT
return mr_;
}
void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT
try {
auto const ptr = mr_->allocate(bytes, stream);
GlobalMemoryLogger().RegisterAllocation(ptr, bytes);
return ptr;
} catch (rmm::bad_alloc const &e) {
detail::ThrowOOMError(e.what(), bytes);
}
return nullptr;
}
void do_deallocate(void *ptr, std::size_t bytes, // NOLINT
rmm::cuda_stream_view stream) override {
mr_->deallocate(ptr, bytes, stream);
GlobalMemoryLogger().RegisterDeallocation(ptr, bytes);
}
[[nodiscard]] bool do_is_equal( // NOLINT
device_memory_resource const &other) const noexcept override {
if (this == &other) {
return true;
}
auto const *cast = dynamic_cast<LoggingResource const *>(&other);
if (cast == nullptr) {
return mr_->is_equal(other);
}
return get_upstream_resource() == cast->get_upstream_resource();
}
};
LoggingResource *GlobalLoggingResource();
/**
* @brief Container class that doesn't initialize the data.
*/
template <typename T>
class DeviceUVector : public rmm::device_uvector<T> {
using Super = rmm::device_uvector<T>;
public:
DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
void Resize(std::size_t n, T const &v) {
auto orig = this->size();
Super::resize(n, rmm::cuda_stream_per_thread);
if (orig < n) {
thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
}
}
private:
// Declared private and left undefined: hides the base-class resize so callers must use Resize.
void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT
};
#else
/**
* @brief Without RMM, new elements are initialized on resize (thrust::device_vector behavior).
*/
template <typename T>
class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
public:
void Resize(std::size_t n) { Super::resize(n); }
void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
private:
// Declared private and left undefined: hides the base-class resize so callers must use Resize.
void resize(std::size_t n, T const &v = T{}); // NOLINT
};
#endif // defined(XGBOOST_USE_RMM)
} // namespace dh
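
To make the two build modes concrete, a behavior sketch for the wrapper; the sizes and fill value are illustrative:

```cpp
#include "device_vector.cuh"

void ResizeModes() {
  dh::DeviceUVector<float> vec;
  // RMM build: uninitialized storage from GlobalLoggingResource(); non-RMM
  // build: thrust::device_vector value-initializes the new elements.
  vec.Resize(4);
  // Both builds: the old four elements are preserved and only the new tail
  // [4, 8) is filled with -1.0f.
  vec.Resize(8, -1.0f);
}
```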


@@ -114,6 +114,11 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
impl_->Vec().resize(new_size, v);
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size) {
impl_->Vec().resize(new_size, T{});
}
template <typename T>
void HostDeviceVector<T>::Fill(T v) {
std::fill(HostVector().begin(), HostVector().end(), v);


@@ -1,16 +1,17 @@
/**
* Copyright 2017-2023 by XGBoost contributors
* Copyright 2017-2024, XGBoost contributors
*/
#include <thrust/fill.h>
#include <thrust/device_ptr.h>
#include <algorithm>
#include <cstddef> // for size_t
#include <cstdint>
#include "device_helpers.cuh"
#include "device_vector.cuh" // for DeviceUVector
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h"
#include "device_helpers.cuh"
#include "xgboost/tree_model.h" // for RegTree
namespace xgboost {
@@ -28,7 +29,7 @@ class HostDeviceVectorImpl {
if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(size, v);
data_d_->Resize(size, v);
} else {
data_h_.resize(size, v);
}
@@ -66,22 +67,22 @@ class HostDeviceVectorImpl {
T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite);
return data_d_->data().get();
return thrust::raw_pointer_cast(data_d_->data());
}
const T* ConstDevicePointer() {
LazySyncDevice(GPUAccess::kRead);
return data_d_->data().get();
return thrust::raw_pointer_cast(data_d_->data());
}
common::Span<T> DeviceSpan() {
LazySyncDevice(GPUAccess::kWrite);
return {data_d_->data().get(), Size()};
return {this->DevicePointer(), Size()};
}
common::Span<const T> ConstDeviceSpan() {
LazySyncDevice(GPUAccess::kRead);
return {data_d_->data().get(), Size()};
return {this->ConstDevicePointer(), Size()};
}
void Fill(T v) { // NOLINT
@@ -91,7 +92,7 @@ class HostDeviceVectorImpl {
gpu_access_ = GPUAccess::kWrite;
SetDevice();
auto s_data = dh::ToSpan(*data_d_);
dh::LaunchN(data_d_->size(),
dh::LaunchN(data_d_->size(), dh::DefaultStream(),
[=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
}
}
@@ -128,7 +129,7 @@ class HostDeviceVectorImpl {
void Extend(HostDeviceVectorImpl* other) {
auto ori_size = this->Size();
this->Resize(ori_size + other->Size(), T());
this->Resize(ori_size + other->Size(), T{});
if (HostCanWrite() && other->HostCanRead()) {
auto& h_vec = this->HostVector();
auto& other_vec = other->HostVector();
@@ -138,10 +139,9 @@ class HostDeviceVectorImpl {
auto ptr = other->ConstDevicePointer();
SetDevice();
CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr,
other->Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
dh::DefaultStream()));
}
}
@@ -171,17 +171,22 @@ class HostDeviceVectorImpl {
}
}
void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; }
template <typename... U>
auto Resize(std::size_t new_size, U&&... args) {
if (new_size == Size()) {
return;
}
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize
gpu_access_ = GPUAccess::kWrite;
SetDevice();
data_d_->resize(new_size, v);
auto old_size = data_d_->size();
data_d_->Resize(new_size, std::forward<U>(args)...);
} else {
// resize on host
LazySyncHost(GPUAccess::kNone);
data_h_.resize(new_size, v);
auto old_size = data_h_.size();
data_h_.resize(new_size, std::forward<U>(args)...);
}
}
@@ -195,10 +200,8 @@ class HostDeviceVectorImpl {
gpu_access_ = access;
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
SetDevice();
dh::safe_cuda(cudaMemcpy(data_h_.data(),
data_d_->data().get(),
data_d_->size() * sizeof(T),
cudaMemcpyDeviceToHost));
dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
}
void LazySyncDevice(GPUAccess access) {
@@ -211,10 +214,9 @@ class HostDeviceVectorImpl {
// data is on the host
LazyResizeDevice(data_h_.size());
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
data_h_.data(),
data_d_->size() * sizeof(T),
cudaMemcpyHostToDevice));
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
dh::DefaultStream()));
gpu_access_ = access;
}
@@ -229,7 +231,7 @@ class HostDeviceVectorImpl {
private:
DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{};
std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone};
void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -239,8 +241,10 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite;
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
data_d_->size() * sizeof(T), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
thrust::raw_pointer_cast(other->data_d_->data()),
data_d_->size() * sizeof(T), cudaMemcpyDefault,
dh::DefaultStream()));
}
}
@@ -248,14 +252,15 @@ class HostDeviceVectorImpl {
LazyResizeDevice(Size());
gpu_access_ = GPUAccess::kWrite;
SetDevice();
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
data_d_->size() * sizeof(T), cudaMemcpyDefault,
dh::DefaultStream()));
}
void LazyResizeDevice(size_t new_size) {
if (data_d_ && new_size == data_d_->size()) { return; }
SetDevice();
data_d_->resize(new_size);
data_d_->Resize(new_size);
}
void SetDevice() {
@@ -267,7 +272,7 @@ class HostDeviceVectorImpl {
}
if (!data_d_) {
data_d_.reset(new dh::device_vector<T>);
data_d_.reset(new dh::DeviceUVector<T>{});
}
}
};
@@ -397,7 +402,12 @@ void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
}
template <typename T>
void HostDeviceVector<T>::Resize(size_t new_size, T v) {
void HostDeviceVector<T>::Resize(std::size_t new_size) {
impl_->Resize(new_size);
}
template <typename T>
void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
impl_->Resize(new_size, v);
}
@@ -427,5 +437,4 @@ template class HostDeviceVector<RTreeNodeStat>;
*/
template class HostDeviceVector<std::size_t>;
#endif // defined(__APPLE__)
} // namespace xgboost
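
The variadic `Resize` above lets a single body serve both public overloads. A reduced, self-contained model of the forwarding; the struct and names are invented for illustration:

```cpp
#include <cstddef>  // for size_t
#include <utility>  // for forward
#include <vector>   // for vector

// Zero extra arguments select resize(n); one argument selects resize(n, v).
// HostDeviceVectorImpl::Resize uses the same trick to dispatch to the two
// DeviceUVector::Resize overloads without duplicating the sync logic.
template <typename T>
struct ForwardingModel {
  std::vector<T> data;
  template <typename... U>
  void Resize(std::size_t n, U&&... args) {
    data.resize(n, std::forward<U>(args)...);
  }
};

// ForwardingModel<int>{}.Resize(8);      // calls data.resize(8)
// ForwardingModel<int>{}.Resize(8, 42);  // calls data.resize(8, 42)
```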


@@ -4,12 +4,14 @@
#ifndef XGBOOST_COMMON_QUANTILE_CUH_
#define XGBOOST_COMMON_QUANTILE_CUH_
#include "xgboost/span.h"
#include "xgboost/data.h"
#include <thrust/logical.h> // for any_of
#include "categorical.h"
#include "device_helpers.cuh"
#include "quantile.h"
#include "timer.h"
#include "categorical.h"
#include "xgboost/data.h"
#include "xgboost/span.h"
namespace xgboost {
namespace common {
@@ -100,9 +102,9 @@ class SketchContainer {
CHECK(device.IsCUDA());
// Initialize Sketches for this dmatrix
this->columns_ptr_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_.Resize(num_columns + 1, 0);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->columns_ptr_b_.Resize(num_columns + 1, 0);
this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types);


@@ -1,7 +1,8 @@
/**
* Copyright 2021-2024, XGBoost Contributors
*/
#include <thrust/copy.h> // for copy
#include <thrust/copy.h> // for copy
#include <thrust/logical.h> // for any_of
#include <thrust/scan.h>
#include <cassert>


@@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater {
out["hist_train_param"] = ToJson(hist_maker_param_);
}
~GPUHistMaker() { // NOLINT
dh::GlobalMemoryLogger().Log();
}
~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); }
void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,


@@ -0,0 +1,21 @@
/**
* Copyright 2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include "../../../src/common/device_vector.cuh"
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
namespace dh {
TEST(DeviceUVector, Basic) {
GlobalMemoryLogger().Clear();
std::int32_t verbosity{3};
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
DeviceUVector<float> uvec;
uvec.Resize(12);
auto peak = GlobalMemoryLogger().PeakMemory();
auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
ASSERT_EQ(peak, n_bytes);
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
}
} // namespace dh


@@ -1,5 +1,5 @@
/**
* Copyright 2018-2023 XGBoost contributors
* Copyright 2018-2024, XGBoost contributors
*/
#include <gtest/gtest.h>
#include <thrust/equal.h>
@@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) {
ASSERT_FALSE(another.Empty());
ASSERT_TRUE(vec.Empty());
}
TEST(HostDeviceVector, Resize) {
auto check = [&](HostDeviceVector<float> const& vec) {
auto const& h_vec = vec.ConstHostSpan();
for (std::size_t i = 0; i < 4; ++i) {
ASSERT_EQ(h_vec[i], i + 1);
}
for (std::size_t i = 4; i < vec.Size(); ++i) {
ASSERT_EQ(h_vec[i], 3.0);
}
};
{
HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
vec.SetDevice(DeviceOrd::CUDA(0));
vec.ConstDeviceSpan();
ASSERT_TRUE(vec.DeviceCanRead());
ASSERT_FALSE(vec.DeviceCanWrite());
vec.DeviceSpan();
vec.Resize(7, 3.0f);
ASSERT_TRUE(vec.DeviceCanWrite());
check(vec);
}
{
HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
ASSERT_TRUE(vec.DeviceCanWrite());
vec.Resize(7, 3.0f);
ASSERT_TRUE(vec.DeviceCanWrite());
check(vec);
}
{
HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
ASSERT_TRUE(vec.HostCanWrite());
vec.Resize(7, 3.0f);
ASSERT_TRUE(vec.HostCanWrite());
check(vec);
}
}
} // namespace xgboost::common


@@ -1,15 +1,14 @@
// Copyright (c) 2019 by Contributors
/**
* Copyright 2019-2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h> // for device
#include <thrust/sequence.h> // for sequence
#include <xgboost/data.h>
#include <xgboost/json.h>
#include <thrust/device_vector.h>
#include <memory>
#include "../../../src/common/bitfield.h"
#include "../../../src/common/device_helpers.cuh"
namespace xgboost {
template <typename T>
Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
thrust::device_vector<T>* out_d_data) {