add cuda to hip wrapper

Your Name 2023-10-17 12:42:37 -07:00
parent ea19555474
commit ffbbc9c968
35 changed files with 60 additions and 509 deletions
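The change applies one pattern throughout: per-call-site #if defined(XGBOOST_USE_CUDA) / #elif defined(XGBOOST_USE_HIP) branches are dropped in favour of the CUDA spelling alone, and the new src/common/cuda_to_hip.h header remaps those CUDA names to their HIP counterparts when XGBOOST_USE_HIP is defined. A minimal before/after sketch of the pattern, drawn from the hunks below (not a new API):

Before, every call site carried both branches:

#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
  dh::safe_cuda(hipSetDevice(device));
#endif

After, a single spelling compiles for both backends, because cuda_to_hip.h defines cudaSetDevice as hipSetDevice under XGBOOST_USE_HIP:

  dh::safe_cuda(cudaSetDevice(device));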

View File

@@ -59,21 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
void XGBoostAPIGuard::SetGPUAttribute() {
  // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
  // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
  cudaGetDevice(&device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipGetDevice(&device_id_);
-#endif
}

void XGBoostAPIGuard::RestoreGPUAttribute() {
  // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
  // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
  cudaSetDevice(device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipSetDevice(device_id_);
-#endif
}

void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,

View File

@@ -26,22 +26,12 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      return;
    }
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#endif
    auto size = count * GetTypeSize(data_type);
    host_buffer_.resize(size);
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
    Allreduce(host_buffer_.data(), count, data_type, op);
    dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
-    AllReduce(host_buffer_.data(), count, data_type, op);
-    dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
-#endif
  }

  void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
@@ -49,7 +39,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      return;
    }
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
    host_buffer_.resize(send_size * world_size_);
    dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
@@ -57,15 +46,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    Allgather(host_buffer_.data(), host_buffer_.size());
    dh::safe_cuda(
        cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-    host_buffer_.resize(send_size * world_size_);
-    dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
-                            hipMemcpyDefault));
-    Allgather(host_buffer_.data(), host_buffer_.size());
-    dh::safe_cuda(
-        hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
-#endif
  }

  void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
@@ -74,11 +54,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
      return;
    }
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#endif
    segments->clear();
    segments->resize(world_size_, 0);
@@ -92,25 +68,15 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    for (int32_t i = 0; i < world_size_; ++i) {
      size_t as_bytes = segments->at(i);
      if (i == rank_) {
-#if defined(XGBOOST_USE_CUDA)
        dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
                                 cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
-                                hipMemcpyDefault));
-#endif
      }
      Broadcast(host_buffer_.data() + offset, as_bytes, i);
      offset += as_bytes;
    }
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
-                            hipMemcpyDefault));
-#elif defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
                             cudaMemcpyDefault));
-#endif
  }

  void Synchronize() override {

View File

@@ -185,13 +185,8 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
      sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
      group_ptr.data() + 1, ctx->CUDACtx()->Stream());
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
-                               sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
                                sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
-#endif
}

/**

View File

@@ -2,17 +2,14 @@
 * Copyright 2018-2022 XGBoost contributors
 */
#include "common.h"
+#include "cuda_to_hip.h"

namespace xgboost {
namespace common {

void SetDevice(std::int32_t device) {
  if (device >= 0) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device));
-#endif
  }
}
@@ -21,17 +18,9 @@ int AllVisibleGPUs() {
  try {
    // When compiled with CUDA but running on CPU only device,
    // cudaGetDeviceCount will fail.
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDeviceCount(&n_visgpus));
-#endif
  } catch (const dmlc::Error &) {
-#if defined(XGBOOST_USE_CUDA)
    cudaGetLastError();  // reset error.
-#elif defined(XGBOOST_USE_HIP)
-    hipGetLastError();  // reset error.
-#endif
    return 0;
  }
  return n_visgpus;

src/common/cuda_to_hip.h (new file, 57 lines)
View File

@@ -0,0 +1,57 @@
/**
* Copyright 2017-2023 XGBoost contributors
*/
#pragma once
#if defined(XGBOOST_USE_HIP)
#define cudaSuccess hipSuccess
#define cudaGetLastError hipGetLastError
#define cudaStream_t hipStream_t
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamLegacy hipStreamLegacy
#define cudaEvent_t hipEvent_t
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaMallocHost hipMallocHost
#define cudaFreeHost hipFreeHost
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyHostToHost hipMemcpyHostToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaMemGetInfo hipMemGetInfo
#define cudaFuncSetAttribute hipFuncSetAttribute
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
#endif
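
As a rough standalone illustration of how the wrapper is meant to be consumed (a hypothetical sketch, not code from this commit): once XGBOOST_USE_HIP is defined, including cuda_to_hip.h turns CUDA-spelled runtime calls into the corresponding HIP runtime calls at preprocessing time, so a translation unit built with hipcc only needs the HIP runtime header.

// Hypothetical sketch, not part of the commit; compile with hipcc.
#define XGBOOST_USE_HIP 1        // normally passed by the build system
#include <hip/hip_runtime.h>     // declares hipSetDevice, hipDeviceSynchronize, ...
#include "cuda_to_hip.h"         // remaps the cuda* names below to their hip* equivalents

int main() {
  int n_devices = 0;
  // cudaGetDeviceCount / cudaSuccess preprocess to hipGetDeviceCount / hipSuccess.
  if (cudaGetDeviceCount(&n_devices) != cudaSuccess || n_devices == 0) {
    return 0;  // no visible GPU, nothing to do
  }
  cudaSetDevice(0);          // becomes hipSetDevice(0)
  cudaDeviceSynchronize();   // becomes hipDeviceSynchronize()
  return 0;
}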

View File

@@ -31,6 +31,8 @@
#include <tuple>
#include <vector>

+#include "cuda_to_hip.h"
#include "../collective/communicator-inl.h"
#include "common.h"
#include "xgboost/global_config.h"

View File

@@ -330,13 +330,8 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
  } else {
    // copy hessian as weight
    CHECK_EQ(d_weight_out.size(), hessian.size());
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
                                  cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
-                                 hipMemcpyDefault));
-#endif
  }
  return d_weight_out;
}

View File

@@ -88,19 +88,10 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
  int n_mps = 0;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
-#endif
  int n_blocks_per_mp = 0;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                              kBlockThreads, shared_mem));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
-                                                             kBlockThreads, shared_mem));
-#endif
  std::uint32_t grid_size = n_blocks_per_mp * n_mps;
  return grid_size;
}
@@ -348,13 +339,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                  size_t columns, size_t begin, size_t end,
                                  SketchContainer *sketch_container) {
  dh::XGBCachingDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#endif
  info.weights_.SetDevice(device);
  auto weights = info.weights_.ConstDeviceSpan();

View File

@@ -140,17 +140,10 @@ class HostDeviceVectorImpl {
      SetDevice();
      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                    ptr,
                                    other->Size() * sizeof(T),
                                    cudaMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(this->DevicePointer() + ori_size,
-                                   ptr,
-                                   other->Size() * sizeof(T),
-                                   hipMemcpyDeviceToDevice));
-#endif
    }
  }
@@ -204,17 +197,10 @@ class HostDeviceVectorImpl {
    if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpy(data_h_.data(),
                             data_d_->data().get(),
                             data_d_->size() * sizeof(T),
                             cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(data_h_.data(),
-                            data_d_->data().get(),
-                            data_d_->size() * sizeof(T),
-                            hipMemcpyDeviceToHost));
-#endif
  }

  void LazySyncDevice(GPUAccess access) {
@@ -228,17 +214,10 @@ class HostDeviceVectorImpl {
    LazyResizeDevice(data_h_.size());
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                  data_h_.data(),
                                  data_d_->size() * sizeof(T),
                                  cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(),
-                                 data_h_.data(),
-                                 data_d_->size() * sizeof(T),
-                                 hipMemcpyHostToDevice));
-#endif
    gpu_access_ = access;
  }
@@ -264,13 +243,8 @@ class HostDeviceVectorImpl {
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                 data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
  }
}
@@ -279,13 +253,8 @@ class HostDeviceVectorImpl {
    gpu_access_ = GPUAccess::kWrite;
    SetDevice();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), begin,
-                                 data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
  }

  void LazyResizeDevice(size_t new_size) {
@@ -297,11 +266,7 @@ class HostDeviceVectorImpl {
  void SetDevice() {
    CHECK_GE(device_, 0);
    if (cudaSetDeviceHandler == nullptr) {
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
    } else {
      (*cudaSetDeviceHandler)(device_);
    }

View File

@@ -12,17 +12,9 @@
namespace xgboost {
namespace linalg {

template <typename T, int32_t D, typename Fn>
-#if defined(XGBOOST_USE_CUDA)
void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
-#elif defined(XGBOOST_USE_HIP)
-void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
-#endif
{
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(t.Device().ordinal));
-#endif
  static_assert(std::is_void<std::result_of_t<Fn(size_t, T&)>>::value,
                "For function with return, use transform instead.");
  if (t.Contiguous()) {
@@ -37,11 +29,7 @@ void ElementWiseKernelDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s
}

template <typename T, int32_t D, typename Fn>
-#if defined(XGBOOST_USE_HIP)
-void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, hipStream_t s = nullptr)
-#elif defined(XGBOOST_USE_CUDA)
void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
-#endif
{
  if (t.Contiguous()) {
    auto ptr = t.Values().data();
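
A note on the hunk above: because cuda_to_hip.h also renames types such as cudaStream_t and cudaEvent_t, declarations that previously needed two #if branches for the signature alone can now be written once. A hypothetical sketch of that idea (not code from this commit; the appropriate CUDA or HIP runtime header is assumed to be included):

#include "cuda_to_hip.h"  // under XGBOOST_USE_HIP, cudaStream_t is hipStream_t

// One signature serves both backends; nullptr means the default (null) stream.
void SyncOnStreamSketch(cudaStream_t s = nullptr) {
  cudaStreamSynchronize(s);  // hipStreamSynchronize(s) under HIP
}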

View File

@@ -110,15 +110,9 @@ void CopyTo(Span<T> out, Span<U> src) {
  CHECK_EQ(out.size(), src.size());
  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                out.size_bytes(),
                                cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(out.data(), src.data(),
-                               out.size_bytes(),
-                               hipMemcpyDefault));
-#endif
}

// Compute the merge path.
@@ -251,11 +245,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
               Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
               Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
  CHECK_EQ(d_x.size() + d_y.size(), out.size());
  CHECK_EQ(x_ptr.size(), out_ptr.size());
@@ -354,11 +344,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
                           common::Span<OffsetT> cuts_ptr,
                           size_t total_cuts, Span<float> weights) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  Span<SketchEntry> out;
  dh::device_vector<SketchEntry> cuts;
@@ -418,11 +404,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
   * pruning or merging. We preserve the first type and remove the second type.
   */
  timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
  dh::XGBCachingDeviceAllocator<char> alloc;
@@ -479,11 +461,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) {
  timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  OffsetT to_total = 0;
  auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -518,11 +496,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
                            Span<SketchEntry const> that) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  timer_.Start(__func__);
  if (this->Current().size() == 0) {
@@ -558,11 +532,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
}

void SketchContainer::FixError() {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
  auto in = dh::ToSpan(this->Current());
@@ -588,11 +558,7 @@ void SketchContainer::FixError() {
}

void SketchContainer::AllReduce(bool is_column_split) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  auto world = collective::GetWorldSize();
  if (world == 1 || is_column_split) {
    return;
@@ -674,11 +640,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
  timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  p_cuts->min_vals_.Resize(num_columns_);
  // Sync between workers.

View File

@@ -176,11 +176,7 @@ class SketchContainer {
  size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
    timer_.Start(__func__);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_));
-#else
    dh::safe_cuda(cudaSetDevice(device_));
-#endif
    this->columns_ptr_.SetDevice(device_);
    Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();

View File

@@ -147,13 +147,8 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
    auto const& h_group_ptr = info.group_ptr_;
    group_ptr_.Resize(h_group_ptr.size());
    auto d_group_ptr = group_ptr_.DeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
                                  cudaMemcpyHostToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
-                                 hipMemcpyHostToDevice, cuctx->Stream()));
-#endif
  }

  auto d_group_ptr = DataGroupPtr(ctx);

View File

@@ -61,13 +61,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
                       out_group_threads_ptr.size());

  size_t total = 0;
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
-                          sizeof(total), hipMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
                           sizeof(total), cudaMemcpyDeviceToHost));
-#endif
  return total;
}

View File

@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
    // default per-thread stream
    default: {
      dh::CUDAEvent e;
-#if defined(XGBOOST_USE_CUDA)
      e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
-#elif defined(XGBOOST_USE_HIP)
-      e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
-#endif
      dh::DefaultStream().Wait(e);
    }
  }

View File

@@ -22,19 +22,11 @@ namespace cub = hipcub;
namespace xgboost {
namespace {
auto SetDeviceToPtr(void const* ptr) {
-#if defined(XGBOOST_USE_CUDA)
  cudaPointerAttributes attr;
  dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
  int32_t ptr_device = attr.device;
  dh::safe_cuda(cudaSetDevice(ptr_device));
  return ptr_device;
-#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
-  hipPointerAttribute_t attr;
-  dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
-  int32_t ptr_device = attr.device;
-  dh::safe_cuda(hipSetDevice(ptr_device));
-  return ptr_device;
-#endif
}

template <typename T, int32_t D>
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
    // set data
    data->Resize(array.n);
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
                                  cudaMemcpyDefault, ctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
-                                 hipMemcpyDefault, ctx->Stream()));
-#endif
  });
  return;
}
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
  });
  bool non_dec = true;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
                           cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
-                          hipMemcpyDeviceToHost));
-#endif
  CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";

  size_t bytes = 0;

View File

@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
    device_idx_ = dh::CudaGetPointerDevice(first_column.data);
    CHECK_NE(device_idx_, Context::kCpuId);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(device_idx_));
-#endif
    for (auto& json_col : json_columns) {
      auto column = ArrayInterface<1>(get<Object const>(json_col));
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
                         float missing) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_idx));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#endif
  IsValidFunctor is_valid(missing);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#endif
  auto n_samples = batch.NumRows();
  bst_feature_t n_features = batch.NumCols();

View File

@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
      n_rows(n_rows) {
  monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif

  monitor_.Start("InitCompressedData");
  InitCompressedData(device);
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
    : is_dense(dmat->IsDense()) {
  monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#endif

  n_rows = dmat->Info().num_row_;
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                 common::Span<size_t> row_counts_span,
                                 common::Span<FeatureType const> feature_types, size_t row_stride,
                                 size_t n_rows, common::HistogramCuts const& cuts) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
  *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
  CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();

  dh::device_vector<size_t> row_ptr(page.row_ptr.size());
  auto d_row_ptr = dh::ToSpan(row_ptr);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
                                cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
-                               hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#endif

  auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
  auto null = accessor.NullValue();
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
    if (row_batch.data.DeviceCanRead()) {
      auto const& d_data = row_batch.data.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), d_data.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), d_data.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
    } else {
      const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
    }
    const dim3 block3(32, 8, 1);  // 256 threads

View File

@@ -10,11 +10,7 @@
namespace xgboost::data {
void EllpackPageSource::Fetch() {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
  if (!this->ReadCache()) {
    if (count_ != 0 && !sync_) {
      // source is initialized to be the 0th page during construction, so when count_ is 0

View File

@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  int32_t current_device;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaGetDevice(&current_device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetDevice(&current_device));
-#endif
  auto get_device = [&]() -> int32_t {
    std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  // ctx_.gpu_id = proxy->DeviceIdx();
  CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
  if (cols == 0) {
    cols = num_cols();
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  auto n_features = cols;
  CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
  if (!ref) {
    HostDeviceVector<FeatureType> ft;
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  while (iter.Next()) {
    init_page();
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(get_device()));
-#endif
    auto rows = num_rows();
    dh::device_vector<size_t> row_counts(rows + 1, 0);

View File

@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
                    : adapter->DeviceIdx();
  CHECK_GE(device, 0);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif

  Context ctx;
  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});

View File

@@ -57,11 +57,7 @@ template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
                     int device_idx, float missing) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_idx));
-#endif
  IsValidFunctor is_valid(missing);
  // Count elements per row

View File

@@ -60,11 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
      return;
    }
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif

    // The begin and end indices for the section of each column associated with
    // this device
@@ -92,17 +88,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
        auto col = page[fidx];
        auto seg = column_segments[fidx];
-#if defined(XGBOOST_USE_CUDA)
        dh::safe_cuda(cudaMemcpy(
            data_.data().get() + row_ptr_[fidx],
            col.data() + seg.first,
            sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(
-            data_.data().get() + row_ptr_[fidx],
-            col.data() + seg.first,
-            sizeof(Entry) * (seg.second - seg.first), hipMemcpyHostToDevice));
-#endif
      }
    }
@@ -182,11 +171,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  // This needs to be public because of the __device__ lambda.
  GradientPair GetBiasGradient(int group_idx, int num_group) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
    auto counting = thrust::make_counting_iterator(0ull);
    auto f = [=] __device__(size_t idx) {
@@ -211,11 +196,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  // This needs to be public because of the __device__ lambda.
  GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
    common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
    size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
@@ -249,17 +230,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
  }

  void UpdateGpair(const std::vector<GradientPair> &host_gpair) {
-#if defined(XGBOOST_USE_CUDA)
    dh::safe_cuda(cudaMemcpyAsync(
        gpair_.data().get(),
        host_gpair.data(),
        gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        gpair_.data().get(),
-        host_gpair.data(),
-        gpair_.size() * sizeof(GradientPair), hipMemcpyHostToDevice));
-#endif
  }

  // training parameter

View File

@@ -95,11 +95,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
             Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
  auto labels = info.labels.View(device);
  auto weights = info.weights_.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  CHECK_NE(labels.Size(), 0);
  CHECK_EQ(labels.Size(), predts.size());
@@ -352,11 +348,7 @@ template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
                           common::Span<uint32_t> d_class_ptr, size_t n_classes,
                           std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
  /**
   * Sorted idx
   */
@@ -934,11 +926,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
                                                 common::Span<float const> predts,
                                                 MetaInfo const &info,
                                                 std::shared_ptr<DeviceAUCCache> *p_cache) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#endif
  if (predts.empty()) {
    return std::make_pair(0.0, static_cast<uint32_t>(0));

View File

@@ -166,12 +166,7 @@ class MultiClassMetricsReduction {
      labels.SetDevice(device_);
      weights.SetDevice(device_);

-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
      result = DeviceReduceMetrics(weights, labels, preds, n_class);
    }
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

View File

@@ -159,11 +159,7 @@ class ElementWiseSurvivalMetricsReduction {
      labels_upper_bound.SetDevice(ctx.gpu_id);
      weights.SetDevice(ctx.gpu_id);

-#if defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(ctx.gpu_id));
-#endif
      result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
    }

View File

@@ -30,22 +30,13 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                          dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
                          HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
  // copy position to buffer
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
  auto cuctx = ctx->CUDACtx();
  size_t n_samples = position.size();
  dh::device_vector<bst_node_t> sorted_position(position.size());
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
                                position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_position.data().get(), position.data(),
-                               position.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
-#endif

  p_ridx->resize(position.size());
  dh::Iota(dh::ToSpan(*p_ridx));
@@ -98,17 +89,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
  bst_node_t* h_first_unique =
      reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));
  dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
                                cudaMemcpyDeviceToHost, copy_stream.View()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-  dh::safe_cuda(hipMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-#endif

  /**
   * copy node index (leaf index)
@@ -171,11 +155,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
                          std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                          HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
  dh::device_vector<size_t> ridx;
  HostDeviceVector<size_t> nptr;
  HostDeviceVector<bst_node_t> nidx;

View File

@@ -297,11 +297,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
            linalg::Matrix<GradientPair>* out_gpair) {
  // boilerplate
  std::int32_t device_id = ctx->gpu_id;
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_id));
-#endif

  auto n_groups = p_cache->Groups();
  info.labels.SetDevice(device_id);
@@ -385,11 +381,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                               linalg::Matrix<GradientPair>* out_gpair) {
  // boilerplate
  auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
  auto const discount = p_cache->Discount(ctx);
@@ -457,11 +449,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
                              linalg::Matrix<GradientPair>* out_gpair) {
  auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  info.labels.SetDevice(device);
  predt.SetDevice(device);
@@ -500,11 +488,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
                                   linalg::Matrix<GradientPair>* out_gpair) {
  auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif

  info.labels.SetDevice(device);
  predt.SetDevice(device);

View File

@ -341,11 +341,7 @@ class DeviceModel {
int num_group; int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
// Copy decision trees to device // Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id); tree_segments = HostDeviceVector<size_t>({}, gpu_id);
@ -366,21 +362,12 @@ class DeviceModel {
auto& src_nodes = model.trees.at(tree_idx)->GetNodes(); auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
auto& src_stats = model.trees.at(tree_idx)->GetStats(); auto& src_stats = model.trees.at(tree_idx)->GetStats();
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(), d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault)); sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync( dh::safe_cuda(cudaMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(), d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
dh::safe_cuda(hipMemcpyAsync(
d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
#endif
} }
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id); tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
@ -504,11 +491,7 @@ void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths, dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories, DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) { int gpu_id) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(gpu_id));
#endif
auto& device_model = *model; auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size()); dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@ -584,15 +567,9 @@ void ExtractPaths(
thrust::max_element(thrust::device, max_elem_it, thrust::max_element(thrust::device, max_elem_it,
max_elem_it + d_cat_node_segments.size()) - max_elem_it + d_cat_node_segments.size()) -
max_elem_it; max_elem_it;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpy(h_max_cat.data(), dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it, d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), cudaMemcpyDeviceToHost)); h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpy(h_max_cat.data(),
d_cat_node_segments.data() + max_cat_it,
h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
#endif
max_cat = h_max_cat[0].size; max_cat = h_max_cat[0].size;
CHECK_GE(max_cat, 1); CHECK_GE(max_cat, 1);
path_categories->resize(max_cat * paths->size()); path_categories->resize(max_cat * paths->size());
@ -786,11 +763,7 @@ class ColumnSplitHelper {
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};
@ -970,11 +943,7 @@ class GPUPredictor : public xgboost::Predictor {
~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
}
}
@ -1071,11 +1040,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented; LOG(FATAL) << "Dart booster feature " << not_implemented;
} }
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {
@ -1135,11 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
LOG(FATAL) << "Dart booster feature " << not_implemented; LOG(FATAL) << "Dart booster feature " << not_implemented;
} }
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
out_contribs->SetDevice(ctx_->gpu_id);
if (tree_end == 0 || tree_end > model.trees.size()) {
@ -1199,11 +1160,7 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
const MetaInfo& info = p_fmat->Info();


@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
for (auto idx : nidx) {
copy_stream_.View().Wait(event);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
#endif
}
}
@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
dh::ToSpan(out_entries));
GPUExpandEntry root_entry;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
hipMemcpyDeviceToHost));
#endif
return root_entry;
}
} // namespace xgboost::tree


@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
split_cats_.resize(node_categorical_storage_size_);
h_split_cats_.resize(node_categorical_storage_size_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(
hipMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
#endif
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());


@ -266,11 +266,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
// decide whether to use shared memory
int device = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDevice(&device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDevice(&device));
#endif
// opt into maximum shared memory for the kernel if necessary
#if defined(XGBOOST_USE_CUDA)
@ -303,17 +299,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
int num_groups = feature_groups.NumGroups();
int n_mps = 0;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
int n_blocks_per_mp = 0;
dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, smem_size));
#endif
// This gives the number of blocks to keep the device occupied
// Use this as the maximum number of blocks
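BuildGradientHistogram also queries the multiprocessor count and kernel occupancy, so the wrapper has to cover those names as well; note the attribute enum is spelled differently on the HIP side, and the shared-memory opt-in above stays behind an explicit XGBOOST_USE_CUDA guard because it has no direct HIP counterpart. A sketch of the extra aliases, again assumed rather than taken from this commit:

#if defined(XGBOOST_USE_HIP)
// Device-attribute and occupancy queries used when sizing the histogram kernel.
#define cudaDeviceGetAttribute         hipDeviceGetAttribute
#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define cudaOccupancyMaxActiveBlocksPerMultiprocessor \
  hipOccupancyMaxActiveBlocksPerMultiprocessor
#endif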
@ -347,11 +336,7 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
runit(SharedMemHistKernel<false, kBlockThreads, kItemsPerThread>);
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
}
} // namespace tree


@ -16,22 +16,14 @@ namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
}
RowPartitioner::~RowPartitioner() {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx_));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx_));
#endif
}
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
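For dh::safe_cuda(cudaSetDevice(device_idx_)) to compile unchanged in both builds, the error type and its query functions must be aliased as well. The helper below is a hypothetical stand-in for the project's safe_cuda macro, shown only to illustrate that once cudaError_t, cudaSuccess, and cudaGetErrorString resolve to their HIP equivalents, the same checking code serves both back ends.

#include <cuda_runtime.h>  // or the cuda-to-hip wrapper header in a ROCm build
#include <stdexcept>
#include <string>

// Hypothetical error-check helper; the real dh::safe_cuda may differ.
inline void CheckDeviceCall(cudaError_t status) {
  if (status != cudaSuccess) {
    // cudaGetErrorString maps to hipGetErrorString in a HIP build.
    throw std::runtime_error(std::string{"device call failed: "} + cudaGetErrorString(status));
  }
}

// Usage mirrors the call sites in this diff:
//   CheckDeviceCall(cudaSetDevice(device_idx_));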


@ -287,15 +287,9 @@ class RowPartitioner {
total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
}
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
h_batch_info.size() * sizeof(PerNodeData<OpDataT>),
cudaMemcpyDefault));
#endif
// Temporary arrays
auto h_counts = pinned_.GetSpan<bst_uint>(nidx.size(), 0);
@ -305,13 +299,8 @@ class RowPartitioner {
SortPositionBatch<RowIndexT, UpdatePositionOpT, OpDataT>(
dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
total_rows, op, &tmp_);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
hipMemcpyDefault));
#endif
// TODO(Rory): this synchronisation hurts performance a lot
// Future optimisation should find a way to skip this
dh::DefaultStream().Sync();
@ -348,15 +337,9 @@ class RowPartitioner {
void FinalisePosition(common::Span<bst_node_t> d_out_position, FinalisePositionOpT op) {
dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
hipMemcpyDefault));
#else
dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
sizeof(NodePositionInfo) * ridx_segments_.size(),
cudaMemcpyDefault));
#endif
constexpr int kBlockSize = 512;
const int kItemsThread = 8;


@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
this->interaction_constraints.Reset();
if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
hipMemcpyDeviceToDevice));
#endif
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;
@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
max_active_features =
std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
#endif
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
hipMemcpyDeviceToHost));
#endif
dh::DefaultStream().Sync();
}
@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
BitVector missing_bits{dh::ToSpan(missing_storage)};
dh::TemporaryArray<NodeSplitData> split_data_storage(num_candidates);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
#endif
auto d_split_data = dh::ToSpan(split_data_storage);
dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
@ -527,15 +499,9 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
cudaMemcpyHostToDevice));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
hipMemcpyHostToDevice));
#endif
auto const& h_split_types = p_tree->GetSplitTypes();
auto const& categories = p_tree->GetSplitCategories();
@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
auto s_position = p_out_position->ConstDeviceSpan();
positions.resize(s_position.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), cudaMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), hipMemcpyDeviceToDevice,
ctx_->CUDACtx()->Stream()));
#endif
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
bst_node_t position = d_out_position[idx];
@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
CHECK(out_preds_d.Device().IsCUDA());
CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
#endif
auto d_position = dh::ToSpan(positions);
CHECK_EQ(out_preds_d.Size(), d_position.size());
auto const& h_nodes = p_tree->GetNodes();
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
ctx_->CUDACtx()->Stream()));
#endif
auto d_nodes = dh::ToSpan(nodes);
CHECK_EQ(out_preds_d.Shape(1), 1);
@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
++t_idx;
}
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetLastError());
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetLastError());
#endif
} catch (const std::exception& e) {
LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
}
@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
#endif
info_->feature_types.SetDevice(ctx_->gpu_id);
maker = std::make_unique<GPUHistMakerDevice>(