Avoid thrust vector initialization. (#10544)
* Avoid thrust vector initialization.
- Add a wrapper for the rmm device uvector.
- Split up the `Resize` method for HDV.
Parent: 89da9f9741
Commit: 1ca4bfd20e
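The motivation: `thrust::device_vector::resize` value-initializes every new element with a fill kernel, while `rmm::device_uvector` only allocates and leaves the memory untouched. A minimal standalone sketch of the difference (illustrative only, assuming an RMM build; not part of the diff):

#include <thrust/device_vector.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

void Compare(std::size_t n) {
  thrust::device_vector<float> a;
  a.resize(n);  // launches a fill kernel: every element is set to float{}

  rmm::device_uvector<float> b{0, rmm::cuda_stream_default};
  b.resize(n, rmm::cuda_stream_default);  // allocation only; contents left undefined
}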
@@ -135,7 +135,9 @@ class HostDeviceVector {
   void SetDevice(DeviceOrd device) const;
 
-  void Resize(size_t new_size, T v = T());
+  void Resize(std::size_t new_size);
+  /** @brief Resize and initialize the data if the new size is larger than the old size. */
+  void Resize(std::size_t new_size, T v);
 
   using value_type = T;  // NOLINT
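Call sites now choose between the two overloads explicitly; a usage sketch (values are hypothetical):

xgboost::HostDeviceVector<float> vec;
vec.SetDevice(xgboost::DeviceOrd::CUDA(0));
vec.Resize(128);        // no fill value: new device elements may stay uninitialized
vec.Resize(256, 0.0f);  // with fill value: elements [128, 256) are set to 0.0f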
@@ -18,7 +18,7 @@ struct CUDAContext {
    * \brief Caching thrust policy.
    */
  auto CTP() const {
-#if THRUST_MAJOR_VERSION >= 2
+#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
    return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
    return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
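`par_nosync` asks Thrust to skip the stream synchronization it would otherwise issue after each algorithm, and the change enables it whenever RMM's stream-ordered allocator is in use, not only on Thrust 2+. A consumption sketch (assumes a `xgboost::Context *ctx`; the `CUDACtx()` accessor follows the rest of the code base):

auto const *cuctx = ctx->CUDACtx();
thrust::sort(cuctx->CTP(), keys.begin(), keys.end());
// The sort is enqueued on dh::DefaultStream() without a trailing device sync.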
@@ -1,26 +1,21 @@
 /**
- * Copyright 2017-2023 XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #pragma once
 #include <thrust/binary_search.h>  // thrust::upper_bound
-#include <thrust/device_malloc_allocator.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>     // for device_ptr
+#include <thrust/device_vector.h>  // for device_vector
 #include <thrust/execution_policy.h>  // thrust::seq
-#include <thrust/gather.h>  // gather
-#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/discard_iterator.h>  // for discard_iterator
 #include <thrust/iterator/transform_output_iterator.h>  // make_transform_output_iterator
-#include <thrust/logical.h>
-#include <thrust/sequence.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
-#include <thrust/transform_scan.h>
 #include <thrust/unique.h>
 
 #include <algorithm>
 #include <cstddef>  // for size_t
 #include <cub/cub.cuh>
-#include <cub/util_allocator.cuh>
+#include <cub/util_type.cuh>  // for UnitWord
 #include <sstream>
 #include <string>
 #include <tuple>
@@ -28,22 +23,14 @@
 
 #include "../collective/communicator-inl.h"
 #include "common.h"
+#include "device_vector.cuh"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/logging.h"
 #include "xgboost/span.h"
 
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-#include "rmm/mr/device/per_device_resource.hpp"
-#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
-#include "rmm/version_config.hpp"
-
-#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-#error "Please use RMM version 0.18 or later"
-#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
-#error "Please use RMM version 0.18 or later"
-#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
-
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#if defined(XGBOOST_USE_RMM)
+#include <rmm/exec_policy.hpp>
+#endif  // defined(XGBOOST_USE_RMM)
 
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) {
   LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
 }
 
-namespace detail {
-/** \brief Keeps track of global device memory allocations. Thread safe.*/
-class MemoryLogger {
-  // Information for a single device
-  struct DeviceStats {
-    size_t currently_allocated_bytes{ 0 };
-    size_t peak_allocated_bytes{ 0 };
-    size_t num_allocations{ 0 };
-    size_t num_deallocations{ 0 };
-    std::map<void *, size_t> device_allocations;
-    void RegisterAllocation(void *ptr, size_t n) {
-      device_allocations[ptr] = n;
-      currently_allocated_bytes += n;
-      peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
-      num_allocations++;
-      CHECK_GT(num_allocations, num_deallocations);
-    }
-    void RegisterDeallocation(void *ptr, size_t n, int current_device) {
-      auto itr = device_allocations.find(ptr);
-      if (itr == device_allocations.end()) {
-        LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
-                     << " that was never allocated\n"
-                     << dmlc::StackTrace();
-      } else {
-        num_deallocations++;
-        CHECK_LE(num_deallocations, num_allocations);
-        currently_allocated_bytes -= itr->second;
-        device_allocations.erase(itr);
-      }
-    }
-  };
-  DeviceStats stats_;
-  std::mutex mutex_;
-
- public:
-  void RegisterAllocation(void *ptr, size_t n) {
-    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      return;
-    }
-    std::lock_guard<std::mutex> guard(mutex_);
-    int current_device;
-    safe_cuda(cudaGetDevice(&current_device));
-    stats_.RegisterAllocation(ptr, n);
-  }
-  void RegisterDeallocation(void *ptr, size_t n) {
-    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      return;
-    }
-    std::lock_guard<std::mutex> guard(mutex_);
-    int current_device;
-    safe_cuda(cudaGetDevice(&current_device));
-    stats_.RegisterDeallocation(ptr, n, current_device);
-  }
-  size_t PeakMemory() const {
-    return stats_.peak_allocated_bytes;
-  }
-  size_t CurrentlyAllocatedBytes() const {
-    return stats_.currently_allocated_bytes;
-  }
-  void Clear()
-  {
-    stats_ = DeviceStats();
-  }
-
-  void Log() {
-    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      return;
-    }
-    std::lock_guard<std::mutex> guard(mutex_);
-    int current_device;
-    safe_cuda(cudaGetDevice(&current_device));
-    LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
-                 << " ========";
-    LOG(CONSOLE) << "Peak memory usage: "
-                 << stats_.peak_allocated_bytes / 1048576 << "MiB";
-    LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
-  }
-};
-} // namespace detail
-
-inline detail::MemoryLogger &GlobalMemoryLogger() {
-  static detail::MemoryLogger memory_logger;
-  return memory_logger;
-}
-
 // dh::DebugSyncDevice(__FILE__, __LINE__);
 inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
   if (file != "" && line != -1) {
@@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
   safe_cuda(cudaGetLastError());
 }
 
-namespace detail {
-
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-template <typename T>
-using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
-#else  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-template <typename T>
-using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-
-inline void ThrowOOMError(std::string const& err, size_t bytes) {
-  auto device = CurrentDevice();
-  auto rank = xgboost::collective::GetRank();
-  std::stringstream ss;
-  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
-     << "- Free memory: " << AvailableMemory(device) << "\n"
-     << "- Requested memory: " << bytes << std::endl;
-  LOG(FATAL) << ss.str();
-}
-
-/**
- * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
- */
-template <class T>
-struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
-  using SuperT = XGBBaseDeviceAllocator<T>;
-  using pointer = thrust::device_ptr<T>;  // NOLINT
-  template<typename U>
-  struct rebind  // NOLINT
-  {
-    using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
-  };
-  pointer allocate(size_t n) {  // NOLINT
-    pointer ptr;
-    try {
-      ptr = SuperT::allocate(n);
-      dh::safe_cuda(cudaGetLastError());
-    } catch (const std::exception &e) {
-      ThrowOOMError(e.what(), n * sizeof(T));
-    }
-    GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
-    return ptr;
-  }
-  void deallocate(pointer ptr, size_t n) {  // NOLINT
-    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    SuperT::deallocate(ptr, n);
-  }
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBDefaultDeviceAllocatorImpl()
-      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-};
-
-/**
- * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
- *        RMM pool allocator is enabled. Does not initialise memory on construction.
- */
-template <class T>
-struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
-  using SuperT = XGBBaseDeviceAllocator<T>;
-  using pointer = thrust::device_ptr<T>;  // NOLINT
-  template<typename U>
-  struct rebind  // NOLINT
-  {
-    using other = XGBCachingDeviceAllocatorImpl<U>;  // NOLINT
-  };
-  cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
-    // Configure allocator with maximum cached bin size of ~1GB and no limit on
-    // maximum cached bytes
-    thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
-        std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
-    return *allocator;
-  }
-  pointer allocate(size_t n) {  // NOLINT
-    pointer thrust_ptr;
-    if (use_cub_allocator_) {
-      T* raw_ptr{nullptr};
-      auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
-                                                             n * sizeof(T));
-      if (errc != cudaSuccess) {
-        ThrowOOMError("Caching allocator", n * sizeof(T));
-      }
-      thrust_ptr = pointer(raw_ptr);
-    } else {
-      try {
-        thrust_ptr = SuperT::allocate(n);
-        dh::safe_cuda(cudaGetLastError());
-      } catch (const std::exception &e) {
-        ThrowOOMError(e.what(), n * sizeof(T));
-      }
-    }
-    GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
-    return thrust_ptr;
-  }
-  void deallocate(pointer ptr, size_t n) {  // NOLINT
-    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    if (use_cub_allocator_) {
-      GetGlobalCachingAllocator().DeviceFree(ptr.get());
-    } else {
-      SuperT::deallocate(ptr, n);
-    }
-  }
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBCachingDeviceAllocatorImpl()
-      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
-        use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
-#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-  XGBOOST_DEVICE void construct(T *) {}  // NOLINT
-
- private:
-  bool use_cub_allocator_{true};
-};
-}  // namespace detail
-
-// Declare xgboost allocators
-// Replacement of allocator with custom backend should occur here
-template <typename T>
-using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
-/*! Be careful that the initialization constructor is a no-op, which means calling
- * `vec.resize(n)` won't initialize the memory region to 0. Instead use
- * `vec.resize(n, 0)`*/
-template <typename T>
-using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
-/** \brief Specialisation of thrust device vector using custom allocator. */
-template <typename T>
-using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;  // NOLINT
-template <typename T>
-using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;  // NOLINT
-
 // Faster to instantiate than caching_device_vector and invokes no synchronisation
 // Use this where vector functionality (e.g. resize) is not required
 template <typename T>
@@ -734,6 +508,11 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
   return ToSpan(vec, offset, size);
 }
 
+template <typename T>
+xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
+  return {thrust::raw_pointer_cast(vec.data()), vec.size()};
+}
+
 // thrust begin, similiar to std::begin
 template <typename T>
 thrust::device_ptr<T> tbegin(xgboost::HostDeviceVector<T>& vector) {  // NOLINT
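The new overload lets device code consume a `DeviceUVector` through the usual span interface; a sketch:

dh::DeviceUVector<float> data;
data.Resize(100);
auto s = dh::ToSpan(data);  // xgboost::common::Span<float>
dh::LaunchN(s.size(), [=] __device__(std::size_t i) { s[i] = 1.0f; });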
@@ -1117,6 +896,15 @@ class CUDAStream {
   void Sync() { this->View().Sync(); }
 };
 
+inline auto CachingThrustPolicy() {
+  XGBCachingDeviceAllocator<char> alloc;
+#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
+  return thrust::cuda::par_nosync(alloc).on(DefaultStream());
+#else
+  return thrust::cuda::par(alloc).on(DefaultStream());
+#endif  // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
+}
+
 // Force nvcc to load data as constant
 template <typename T>
 class LDGIterator {
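`CachingThrustPolicy()` is the free-standing counterpart of `CUDAContext::CTP()` for call sites that have no `Context` at hand; a sketch:

dh::caching_device_vector<int> idx(100);
thrust::sequence(dh::CachingThrustPolicy(), idx.begin(), idx.end());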
new file: src/common/device_vector.cu
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2017-2024, XGBoost contributors
+ */
+#include "../collective/communicator-inl.h"  // for GetRank
+#include "device_helpers.cuh"                // for CurrentDevice
+#include "device_vector.cuh"
+
+namespace dh {
+namespace detail {
+void ThrowOOMError(std::string const &err, size_t bytes) {
+  auto device = CurrentDevice();
+  auto rank = xgboost::collective::GetRank();
+  std::stringstream ss;
+  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
+     << "- Free memory: " << dh::AvailableMemory(device) << "\n"
+     << "- Requested memory: " << bytes << std::endl;
+  LOG(FATAL) << ss.str();
+}
+}  // namespace detail
+
+#if defined(XGBOOST_USE_RMM)
+LoggingResource *GlobalLoggingResource() {
+  static auto mr{std::make_unique<LoggingResource>()};
+  return mr.get();
+}
+#endif  // defined(XGBOOST_USE_RMM)
+}  // namespace dh
new file: src/common/device_vector.cuh
@@ -0,0 +1,330 @@
+/**
+ * Copyright 2017-2024, XGBoost Contributors
+ */
+#pragma once
+#include <thrust/device_malloc_allocator.h>  // for device_malloc_allocator
+#include <thrust/device_ptr.h>               // for device_ptr
+#include <thrust/device_vector.h>            // for device_vector
+
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+#include <rmm/device_uvector.hpp>                      // for device_uvector
+#include <rmm/exec_policy.hpp>                         // for exec_policy_nosync
+#include <rmm/mr/device/device_memory_resource.hpp>    // for device_memory_resource
+#include <rmm/mr/device/per_device_resource.hpp>       // for get_current_device_resource
+#include <rmm/mr/device/thrust_allocator_adaptor.hpp>  // for thrust_allocator
+#include <rmm/version_config.hpp>                      // for RMM_VERSION_MAJOR
+
+#include "xgboost/global_config.h"  // for GlobalConfigThreadLocalStore
+
+#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
+#error "Please use RMM version 0.18 or later"
+#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
+#error "Please use RMM version 0.18 or later"
+#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
+
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
+#include <cstddef>                 // for size_t
+#include <cub/util_allocator.cuh>  // for CachingDeviceAllocator
+#include <cub/util_device.cuh>     // for CurrentDevice
+#include <map>                     // for map
+#include <memory>                  // for unique_ptr
+
+#include "common.h"  // for safe_cuda
+#include "xgboost/logging.h"
+
+namespace dh {
+namespace detail {
+/** \brief Keeps track of global device memory allocations. Thread safe.*/
+class MemoryLogger {
+  // Information for a single device
+  struct DeviceStats {
+    std::size_t currently_allocated_bytes{0};
+    size_t peak_allocated_bytes{0};
+    size_t num_allocations{0};
+    size_t num_deallocations{0};
+    std::map<void *, size_t> device_allocations;
+    void RegisterAllocation(void *ptr, size_t n) {
+      device_allocations[ptr] = n;
+      currently_allocated_bytes += n;
+      peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes);
+      num_allocations++;
+      CHECK_GT(num_allocations, num_deallocations);
+    }
+    void RegisterDeallocation(void *ptr, size_t n, int current_device) {
+      auto itr = device_allocations.find(ptr);
+      if (itr == device_allocations.end()) {
+        LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device
+                     << " that was never allocated\n"
+                     << dmlc::StackTrace();
+      } else {
+        num_deallocations++;
+        CHECK_LE(num_deallocations, num_allocations);
+        currently_allocated_bytes -= itr->second;
+        device_allocations.erase(itr);
+      }
+    }
+  };
+  DeviceStats stats_;
+  std::mutex mutex_;
+
+ public:
+  void RegisterAllocation(void *ptr, size_t n) {
+    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      return;
+    }
+    std::lock_guard<std::mutex> guard(mutex_);
+    stats_.RegisterAllocation(ptr, n);
+  }
+  void RegisterDeallocation(void *ptr, size_t n) {
+    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      return;
+    }
+    std::lock_guard<std::mutex> guard(mutex_);
+    stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice());
+  }
+  size_t PeakMemory() const { return stats_.peak_allocated_bytes; }
+  size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; }
+  void Clear() { stats_ = DeviceStats(); }
+
+  void Log() {
+    if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      return;
+    }
+    std::lock_guard<std::mutex> guard(mutex_);
+    int current_device;
+    dh::safe_cuda(cudaGetDevice(&current_device));
+    LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
+                 << " ========";
+    LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
+    LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
+  }
+};
+
+void ThrowOOMError(std::string const &err, size_t bytes);
+}  // namespace detail
+
+inline detail::MemoryLogger &GlobalMemoryLogger() {
+  static detail::MemoryLogger memory_logger;
+  return memory_logger;
+}
+
+namespace detail {
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+template <typename T>
+using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
+#else  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+template <typename T>
+using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+
+/**
+ * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
+ */
+template <class T>
+struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
+  using SuperT = XGBBaseDeviceAllocator<T>;
+  using pointer = thrust::device_ptr<T>;  // NOLINT
+  template <typename U>
+  struct rebind  // NOLINT
+  {
+    using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
+  };
+  pointer allocate(size_t n) {  // NOLINT
+    pointer ptr;
+    try {
+      ptr = SuperT::allocate(n);
+      dh::safe_cuda(cudaGetLastError());
+    } catch (const std::exception &e) {
+      detail::ThrowOOMError(e.what(), n * sizeof(T));
+    }
+    GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
+    return ptr;
+  }
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
+    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
+    SuperT::deallocate(ptr, n);
+  }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBDefaultDeviceAllocatorImpl()
+      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+};
+
+/**
+ * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
+ *        RMM pool allocator is enabled. Does not initialise memory on construction.
+ */
+template <class T>
+struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
+  using SuperT = XGBBaseDeviceAllocator<T>;
+  using pointer = thrust::device_ptr<T>;  // NOLINT
+  template <typename U>
+  struct rebind  // NOLINT
+  {
+    using other = XGBCachingDeviceAllocatorImpl<U>;  // NOLINT
+  };
+  cub::CachingDeviceAllocator &GetGlobalCachingAllocator() {
+    // Configure allocator with maximum cached bin size of ~1GB and no limit on
+    // maximum cached bytes
+    thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
+        std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
+    return *allocator;
+  }
+  pointer allocate(size_t n) {  // NOLINT
+    pointer thrust_ptr;
+    if (use_cub_allocator_) {
+      T *raw_ptr{nullptr};
+      auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
+                                                             n * sizeof(T));
+      if (errc != cudaSuccess) {
+        detail::ThrowOOMError("Caching allocator", n * sizeof(T));
+      }
+      thrust_ptr = pointer(raw_ptr);
+    } else {
+      try {
+        thrust_ptr = SuperT::allocate(n);
+        dh::safe_cuda(cudaGetLastError());
+      } catch (const std::exception &e) {
+        detail::ThrowOOMError(e.what(), n * sizeof(T));
+      }
+    }
+    GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
+    return thrust_ptr;
+  }
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
+    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
+    if (use_cub_allocator_) {
+      GetGlobalCachingAllocator().DeviceFree(ptr.get());
+    } else {
+      SuperT::deallocate(ptr, n);
+    }
+  }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBCachingDeviceAllocatorImpl()
+      : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
+        use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBOOST_DEVICE void construct(T *) {}  // NOLINT
+
+ private:
+  bool use_cub_allocator_{true};
+};
+}  // namespace detail
+
+// Declare xgboost allocators
+// Replacement of allocator with custom backend should occur here
+template <typename T>
+using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl<T>;
+
+/** Be careful that the initialization constructor is a no-op, which means calling
+ * `vec.resize(n)` won't initialize the memory region to 0. Instead use
+ * `vec.resize(n, 0)`
+ */
+template <typename T>
+using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl<T>;
+
+/** @brief Specialisation of thrust device vector using custom allocator. */
+template <typename T>
+using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;  // NOLINT
+template <typename T>
+using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;  // NOLINT
+
+#if defined(XGBOOST_USE_RMM)
+/**
+ * @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead.
+ */
+class LoggingResource : public rmm::mr::device_memory_resource {
+  rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()};
+
+ public:
+  LoggingResource() = default;
+  ~LoggingResource() override = default;
+  LoggingResource(LoggingResource const &) = delete;
+  LoggingResource &operator=(LoggingResource const &) = delete;
+  LoggingResource(LoggingResource &&) noexcept = default;
+  LoggingResource &operator=(LoggingResource &&) noexcept = default;
+
+  [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept {  // NOLINT
+    return mr_;
+  }
+  [[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept {  // NOLINT
+    return mr_;
+  }
+
+  void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override {  // NOLINT
+    try {
+      auto const ptr = mr_->allocate(bytes, stream);
+      GlobalMemoryLogger().RegisterAllocation(ptr, bytes);
+      return ptr;
+    } catch (rmm::bad_alloc const &e) {
+      detail::ThrowOOMError(e.what(), bytes);
+    }
+    return nullptr;
+  }
+
+  void do_deallocate(void *ptr, std::size_t bytes,  // NOLINT
+                     rmm::cuda_stream_view stream) override {
+    mr_->deallocate(ptr, bytes, stream);
+    GlobalMemoryLogger().RegisterDeallocation(ptr, bytes);
+  }
+
+  [[nodiscard]] bool do_is_equal(  // NOLINT
+      device_memory_resource const &other) const noexcept override {
+    if (this == &other) {
+      return true;
+    }
+    auto const *cast = dynamic_cast<LoggingResource const *>(&other);
+    if (cast == nullptr) {
+      return mr_->is_equal(other);
+    }
+    return get_upstream_resource() == cast->get_upstream_resource();
+  }
+};
+
+LoggingResource *GlobalLoggingResource();
+
+/**
+ * @brief Container class that doesn't initialize the data.
+ */
+template <typename T>
+class DeviceUVector : public rmm::device_uvector<T> {
+  using Super = rmm::device_uvector<T>;
+
+ public:
+  DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
+
+  void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
+  void Resize(std::size_t n, T const &v) {
+    auto orig = this->size();
+    Super::resize(n, rmm::cuda_stream_per_thread);
+    if (orig < n) {
+      thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
+    }
+  }
+
+ private:
+  // undefined private, cannot be accessed.
+  void resize(std::size_t n, rmm::cuda_stream_view stream);  // NOLINT
+};
+
+#else
+
+/**
+ * @brief Without RMM, the initialization will happen.
+ */
+template <typename T>
+class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
+  using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
+
+ public:
+  void Resize(std::size_t n) { Super::resize(n); }
+  void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
+
+ private:
+  // undefined private, cannot be accessed.
+  void resize(std::size_t n, T const &v = T{});  // NOLINT
+};
+
+#endif  // defined(XGBOOST_USE_RMM)
+}  // namespace dh
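Both branches expose the same two-method surface, so `HostDeviceVector` can forward to either. Under RMM, `Resize(n)` is allocation-only and `Resize(n, v)` fills just the newly appended tail; the inherited `resize` is re-declared private so the initializing base-class path cannot be reached by accident. A behavior sketch (RMM build assumed):

dh::DeviceUVector<int> v;
v.Resize(4);      // allocation only: v[0..3] hold indeterminate values
v.Resize(8, -1);  // v[4..7] are filled with -1 via thrust::fill; v[0..3] untouched
// v.resize(8, rmm::cuda_stream_per_thread);  // ill-formed: resize is private here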
@@ -114,6 +114,11 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
   impl_->Vec().resize(new_size, v);
 }
 
+template <typename T>
+void HostDeviceVector<T>::Resize(size_t new_size) {
+  impl_->Vec().resize(new_size, T{});
+}
+
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
   std::fill(HostVector().begin(), HostVector().end(), v);
@@ -1,16 +1,17 @@
 /**
- * Copyright 2017-2023 by XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <thrust/fill.h>
-#include <thrust/device_ptr.h>
 
 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <cstdint>
 
+#include "device_helpers.cuh"
+#include "device_vector.cuh"  // for DeviceUVector
 #include "xgboost/data.h"
 #include "xgboost/host_device_vector.h"
-#include "xgboost/tree_model.h"
-#include "device_helpers.cuh"
+#include "xgboost/tree_model.h"  // for RegTree
 
 namespace xgboost {
@@ -28,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(size, v);
+      data_d_->Resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -66,22 +67,22 @@ class HostDeviceVectorImpl {
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return data_d_->data().get();
+    return thrust::raw_pointer_cast(data_d_->data());
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return data_d_->data().get();
+    return thrust::raw_pointer_cast(data_d_->data());
   }
 
   common::Span<T> DeviceSpan() {
     LazySyncDevice(GPUAccess::kWrite);
-    return {data_d_->data().get(), Size()};
+    return {this->DevicePointer(), Size()};
   }
 
   common::Span<const T> ConstDeviceSpan() {
     LazySyncDevice(GPUAccess::kRead);
-    return {data_d_->data().get(), Size()};
+    return {this->ConstDevicePointer(), Size()};
   }
 
   void Fill(T v) {  // NOLINT
@@ -91,7 +92,7 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto s_data = dh::ToSpan(*data_d_);
-      dh::LaunchN(data_d_->size(),
+      dh::LaunchN(data_d_->size(), dh::DefaultStream(),
                   [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
     }
   }
@@ -128,7 +129,7 @@ class HostDeviceVectorImpl {
 
   void Extend(HostDeviceVectorImpl* other) {
     auto ori_size = this->Size();
-    this->Resize(ori_size + other->Size(), T());
+    this->Resize(ori_size + other->Size(), T{});
     if (HostCanWrite() && other->HostCanRead()) {
       auto& h_vec = this->HostVector();
       auto& other_vec = other->HostVector();
@@ -138,10 +139,9 @@ class HostDeviceVectorImpl {
       auto ptr = other->ConstDevicePointer();
       SetDevice();
       CHECK_EQ(this->Device(), other->Device());
-      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
-                                    ptr,
-                                    other->Size() * sizeof(T),
-                                    cudaMemcpyDeviceToDevice));
+      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
+                                    other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
+                                    dh::DefaultStream()));
     }
   }
@@ -171,17 +171,22 @@ class HostDeviceVectorImpl {
     }
   }
 
-  void Resize(size_t new_size, T v) {
-    if (new_size == Size()) { return; }
+  template <typename... U>
+  auto Resize(std::size_t new_size, U&&... args) {
+    if (new_size == Size()) {
+      return;
+    }
     if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
       // fast on-device resize
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(new_size, v);
+      auto old_size = data_d_->size();
+      data_d_->Resize(new_size, std::forward<U>(args)...);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
-      data_h_.resize(new_size, v);
+      auto old_size = data_h_.size();
+      data_h_.resize(new_size, std::forward<U>(args)...);
     }
   }
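The parameter pack folds both public overloads into a single implementation: with an empty pack the call collapses to `Resize(new_size)`, and with a fill value to `Resize(new_size, v)`. A standalone sketch of the same forwarding trick (illustrative, using `std::vector` instead of the device containers):

#include <cstddef>
#include <utility>
#include <vector>

template <typename T>
class Holder {
  std::vector<T> data_;

 public:
  template <typename... U>
  void Resize(std::size_t n, U &&...args) {
    // Expands to data_.resize(n) or data_.resize(n, v) depending on the caller.
    data_.resize(n, std::forward<U>(args)...);
  }
};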
@@ -195,10 +200,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(),
-                             data_d_->data().get(),
-                             data_d_->size() * sizeof(T),
-                             cudaMemcpyDeviceToHost));
+    dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
+                             data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -211,10 +214,9 @@ class HostDeviceVectorImpl {
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
-                                  data_h_.data(),
-                                  data_d_->size() * sizeof(T),
-                                  cudaMemcpyHostToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
+                                  data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
+                                  dh::DefaultStream()));
     gpu_access_ = access;
   }
@@ -229,7 +231,7 @@ class HostDeviceVectorImpl {
  private:
   DeviceOrd device_{DeviceOrd::CPU()};
   std::vector<T> data_h_{};
-  std::unique_ptr<dh::device_vector<T>> data_d_{};
+  std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
 
   void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -239,8 +241,10 @@ class HostDeviceVectorImpl {
       LazyResizeDevice(Size());
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                    data_d_->size() * sizeof(T), cudaMemcpyDefault));
+      dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
+                                    thrust::raw_pointer_cast(other->data_d_->data()),
+                                    data_d_->size() * sizeof(T), cudaMemcpyDefault,
+                                    dh::DefaultStream()));
     }
   }
@@ -248,14 +252,15 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
-                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
+                                  data_d_->size() * sizeof(T), cudaMemcpyDefault,
+                                  dh::DefaultStream()));
   }
 
   void LazyResizeDevice(size_t new_size) {
     if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_->resize(new_size);
+    data_d_->Resize(new_size);
   }
 
   void SetDevice() {
@@ -267,7 +272,7 @@ class HostDeviceVectorImpl {
     }
 
     if (!data_d_) {
-      data_d_.reset(new dh::device_vector<T>);
+      data_d_.reset(new dh::DeviceUVector<T>{});
     }
   }
 };
@@ -397,7 +402,12 @@ void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(std::size_t new_size) {
+  impl_->Resize(new_size);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
   impl_->Resize(new_size, v);
 }
@@ -427,5 +437,4 @@ template class HostDeviceVector<RTreeNodeStat>;
  */
 template class HostDeviceVector<std::size_t>;
 #endif  // defined(__APPLE__)
-
 }  // namespace xgboost
@@ -4,12 +4,14 @@
 #ifndef XGBOOST_COMMON_QUANTILE_CUH_
 #define XGBOOST_COMMON_QUANTILE_CUH_
 
-#include "xgboost/span.h"
-#include "xgboost/data.h"
+#include <thrust/logical.h>  // for any_of
+
+#include "categorical.h"
 #include "device_helpers.cuh"
 #include "quantile.h"
 #include "timer.h"
-#include "categorical.h"
+#include "xgboost/data.h"
+#include "xgboost/span.h"
 
 namespace xgboost {
 namespace common {
@@ -100,9 +102,9 @@ class SketchContainer {
     CHECK(device.IsCUDA());
     // Initialize Sketches for this dmatrix
     this->columns_ptr_.SetDevice(device_);
-    this->columns_ptr_.Resize(num_columns + 1);
+    this->columns_ptr_.Resize(num_columns + 1, 0);
     this->columns_ptr_b_.SetDevice(device_);
-    this->columns_ptr_b_.Resize(num_columns + 1);
+    this->columns_ptr_b_.Resize(num_columns + 1, 0);
 
     this->feature_types_.Resize(feature_types.Size());
     this->feature_types_.Copy(feature_types);
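Because the one-argument `Resize` no longer zero-fills device memory, offset arrays such as `columns_ptr_` must request initialization explicitly; a sketch of the hazard being avoided (element type simplified to `std::size_t`, `num_columns` hypothetical):

xgboost::HostDeviceVector<std::size_t> columns_ptr;
columns_ptr.SetDevice(xgboost::DeviceOrd::CUDA(0));
// Without the explicit 0 the entries would be indeterminate under RMM, and the
// first scan over the offsets would read garbage.
columns_ptr.Resize(num_columns + 1, 0);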
@@ -1,7 +1,8 @@
 /**
  * Copyright 2021-2024, XGBoost Contributors
  */
 #include <thrust/copy.h>  // for copy
+#include <thrust/logical.h>  // for any_of
 #include <thrust/scan.h>
 
 #include <cassert>
@@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater {
     out["hist_train_param"] = ToJson(hist_maker_param_);
   }
 
-  ~GPUHistMaker() {  // NOLINT
-    dh::GlobalMemoryLogger().Log();
-  }
+  ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); }
 
   void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
new file: tests/cpp/common/test_device_vector.cu
@@ -0,0 +1,21 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../../../src/common/device_vector.cuh"
+#include "xgboost/global_config.h"  // for GlobalConfigThreadLocalStore
+
+namespace dh {
+TEST(DeviceUVector, Basic) {
+  GlobalMemoryLogger().Clear();
+  std::int32_t verbosity{3};
+  std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
+  DeviceUVector<float> uvec;
+  uvec.Resize(12);
+  auto peak = GlobalMemoryLogger().PeakMemory();
+  auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
+  ASSERT_EQ(peak, n_bytes);
+  std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
+}
+}  // namespace dh
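The assertion holds because `Resize(12)` performs exactly one device allocation and no fill kernel, so the recorded peak equals the raw buffer size: 12 elements * sizeof(float) = 48 bytes. The `MemoryLogger` records only while verbosity is at debug level, which is what the `std::swap` around the body arranges; a sketch of that dependency:

// If verbosity were left below debug, the logger would record nothing and
// PeakMemory() would stay 0 — hence the swap to verbosity 3 before the body.
dh::GlobalMemoryLogger().Clear();
dh::DeviceUVector<float> v;
v.Resize(12);                                       // one allocation of 48 bytes
auto peak = dh::GlobalMemoryLogger().PeakMemory();  // 48 with debug logging, 0 without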
@@ -1,5 +1,5 @@
 /**
- * Copyright 2018-2023 XGBoost contributors
+ * Copyright 2018-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/equal.h>
@@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) {
   ASSERT_FALSE(another.Empty());
   ASSERT_TRUE(vec.Empty());
 }
+
+TEST(HostDeviceVector, Resize) {
+  auto check = [&](HostDeviceVector<float> const& vec) {
+    auto const& h_vec = vec.ConstHostSpan();
+    for (std::size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(h_vec[i], i + 1);
+    }
+    for (std::size_t i = 4; i < vec.Size(); ++i) {
+      ASSERT_EQ(h_vec[i], 3.0);
+    }
+  };
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    vec.SetDevice(DeviceOrd::CUDA(0));
+    vec.ConstDeviceSpan();
+    ASSERT_TRUE(vec.DeviceCanRead());
+    ASSERT_FALSE(vec.DeviceCanWrite());
+    vec.DeviceSpan();
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    ASSERT_TRUE(vec.HostCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.HostCanWrite());
+    check(vec);
+  }
+}
 }  // namespace xgboost::common
@@ -1,15 +1,14 @@
-// Copyright (c) 2019 by Contributors
+/**
+ * Copyright 2019-2024, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>  // for device
+#include <thrust/sequence.h>          // for sequence
 #include <xgboost/data.h>
 #include <xgboost/json.h>
-#include <thrust/device_vector.h>
-
-#include <memory>
-#include "../../../src/common/bitfield.h"
-#include "../../../src/common/device_helpers.cuh"
 
 namespace xgboost {
 
 template <typename T>
 Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
                          thrust::device_vector<T>* out_d_data) {
|||||||
Loading…
x
Reference in New Issue
Block a user