From ffbbc9c9689343719e62bfb1c197f521d50bcb9e Mon Sep 17 00:00:00 2001
From: Your Name <96135754+amdsc21@users.noreply.github.com>
Date: Tue, 17 Oct 2023 12:42:37 -0700
Subject: [PATCH] Add CUDA-to-HIP wrapper header

---
 src/c_api/c_api.cu                     |  8 ---
 .../device_communicator_adapter.cuh    | 34 -----------
 src/common/algorithm.cuh               |  5 --
 src/common/common.cu                   | 13 +----
 src/common/cuda_to_hip.h               | 57 ++++++++++++++++++
 src/common/device_helpers.hip.h        |  2 +
 src/common/hist_util.cu                |  5 --
 src/common/hist_util.cuh               | 15 -----
 src/common/host_device_vector.cu       | 35 -----------
 src/common/linalg_op.cuh               | 12 ----
 src/common/quantile.cu                 | 38 ------------
 src/common/quantile.cuh                |  4 --
 src/common/ranking_utils.cu            |  5 --
 src/common/threading_utils.cuh         |  5 --
 src/data/array_interface.cu            |  4 --
 src/data/data.cu                       | 18 ------
 src/data/device_adapter.cuh            | 12 ----
 src/data/ellpack_page.cu               | 29 ----------
 src/data/ellpack_page_source.cu        |  4 --
 src/data/iterative_dmatrix.cu          | 16 -----
 src/data/simple_dmatrix.cu             |  4 --
 src/data/simple_dmatrix.cuh            |  4 --
 src/linear/updater_gpu_coordinate.cu   | 26 ---------
 src/metric/auc.cu                      | 12 ----
 src/metric/multiclass_metric.cu        |  5 --
 src/metric/survival_metric.cu          |  4 --
 src/objective/adaptive.cu              | 20 -------
 src/objective/lambdarank_obj.cu        | 16 -----
 src/predictor/gpu_predictor.cu         | 43 --------------
 src/tree/gpu_hist/evaluate_splits.cu   | 11 ----
 src/tree/gpu_hist/evaluator.cu         |  5 --
 src/tree/gpu_hist/histogram.cu         | 15 -----
 src/tree/gpu_hist/row_partitioner.cu   |  8 ---
 src/tree/gpu_hist/row_partitioner.cuh  | 17 ------
 src/tree/updater_gpu_hist.cu           | 58 -------------------
 35 files changed, 60 insertions(+), 509 deletions(-)
 create mode 100644 src/common/cuda_to_hip.h
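Note: the patch collapses the per-call-site `#if defined(XGBOOST_USE_CUDA)` / `#elif defined(XGBOOST_USE_HIP)` blocks scattered across the GPU sources into one mapping header, src/common/cuda_to_hip.h, which #defines each CUDA runtime symbol to its HIP counterpart when XGBOOST_USE_HIP is set. A minimal sketch of the technique; the shim lines mirror the new header, while the call-site function is invented for illustration:

    #include <cstddef>

    // cuda_to_hip-style shim (excerpt; guarded so CUDA builds are untouched)
    #if defined(XGBOOST_USE_HIP)
    #include <hip/hip_runtime.h>
    #define cudaSetDevice     hipSetDevice
    #define cudaMemcpyAsync   hipMemcpyAsync
    #define cudaMemcpyDefault hipMemcpyDefault
    #else
    #include <cuda_runtime.h>
    #endif

    // Call sites are written once against the CUDA spelling; under HIP the
    // preprocessor rewrites them before the compiler ever sees them.
    inline void CopyToDevice(void* dst, void const* src, std::size_t n) {
      cudaSetDevice(0);
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault);
    }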
diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu
index 506be723b..de21e9749 100644
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -59,21 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
 void XGBoostAPIGuard::SetGPUAttribute() {
   // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
   // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
   cudaGetDevice(&device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipGetDevice(&device_id_);
-#endif
 }
 
 void XGBoostAPIGuard::RestoreGPUAttribute() {
   // Not calling `safe_cuda` to avoid unnecessary exception handling overhead.
   // If errors, do nothing, assuming running on CPU only machine.
-#if defined(XGBOOST_USE_CUDA)
   cudaSetDevice(device_id_);
-#elif defined(XGBOOST_USE_HIP)
-  hipSetDevice(device_id_);
-#endif
 }
 
 void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad,
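The guard above saves the caller's device ordinal on entry and restores it on exit, deliberately skipping safe_cuda so a CPU-only machine simply no-ops. A standalone sketch of that save/restore idiom (the class name here is invented, not XGBoost API):

    #include <cuda_runtime.h>

    // RAII version of the save/restore pattern behind XGBoostAPIGuard.
    // Errors are intentionally ignored, mirroring the comments above.
    class DeviceOrdinalGuard {
      int device_id_{-1};

     public:
      DeviceOrdinalGuard() { cudaGetDevice(&device_id_); }
      ~DeviceOrdinalGuard() {
        if (device_id_ >= 0) {
          cudaSetDevice(device_id_);
          cudaGetLastError();  // clear any sticky error from the calls above
        }
      }
    };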
diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh
index 49c0405cb..0ffa28770 100644
--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@@ -26,22 +26,12 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
       return;
     }
 
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#endif
 
     auto size = count * GetTypeSize(data_type);
     host_buffer_.resize(size);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
     Allreduce(host_buffer_.data(), count, data_type, op);
     dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(host_buffer_.data(), send_receive_buffer, size, hipMemcpyDefault));
-    AllReduce(host_buffer_.data(), count, data_type, op);
-    dh::safe_cuda(hipMemcpy(send_receive_buffer, host_buffer_.data(), size, hipMemcpyDefault));
-#endif
   }
 
   void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override {
@@ -49,7 +39,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
       return;
     }
 
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
     host_buffer_.resize(send_size * world_size_);
     dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
@@ -57,15 +46,6 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
                              cudaMemcpyDefault));
     Allgather(host_buffer_.data(), host_buffer_.size());
     dh::safe_cuda(
         cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-    host_buffer_.resize(send_size * world_size_);
-    dh::safe_cuda(hipMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size,
-                            hipMemcpyDefault));
-    Allgather(host_buffer_.data(), host_buffer_.size());
-    dh::safe_cuda(
-        hipMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), hipMemcpyDefault));
-#endif
   }
 
   void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments,
@@ -74,11 +54,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
       return;
     }
 
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_ordinal_));
-#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_ordinal_));
-#endif
 
     segments->clear();
     segments->resize(world_size_, 0);
@@ -92,25 +68,15 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
     for (int32_t i = 0; i < world_size_; ++i) {
       size_t as_bytes = segments->at(i);
       if (i == rank_) {
-#if defined(XGBOOST_USE_CUDA)
         dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
                                  cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
-                                hipMemcpyDefault));
-#endif
       }
       Broadcast(host_buffer_.data() + offset, as_bytes, i);
       offset += as_bytes;
     }
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
-                            hipMemcpyDefault));
-#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
                              cudaMemcpyDefault));
-#endif
   }
 
   void Synchronize() override {
diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index 20192a7f2..8bf6bb808 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -185,13 +185,8 @@ void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr,
                                sorted_idx_out.data().get(), sorted_idx.size(), n_groups,
                                group_ptr.data(), group_ptr.data() + 1, ctx->CUDACtx()->Stream());
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
-                               sorted_idx.size_bytes(), hipMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
                                 sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
-#endif
 }
 
 /**
diff --git a/src/common/common.cu b/src/common/common.cu
index 0997b7c83..b57890906 100644
--- a/src/common/common.cu
+++ b/src/common/common.cu
@@ -2,17 +2,14 @@
  * Copyright 2018-2022 XGBoost contributors
  */
 #include "common.h"
+#include "cuda_to_hip.h"
 
 namespace xgboost {
 namespace common {
 
 void SetDevice(std::int32_t device) {
   if (device >= 0) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device));
-#endif
   }
 }
 
@@ -21,17 +18,9 @@ int AllVisibleGPUs() {
   try {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipGetDeviceCount(&n_visgpus));
-#endif
   } catch (const dmlc::Error &) {
-#if defined(XGBOOST_USE_CUDA)
     cudaGetLastError();  // reset error.
-#elif defined(XGBOOST_USE_HIP)
-    hipGetLastError();  // reset error.
-#endif
     return 0;
   }
   return n_visgpus;
 }
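AllVisibleGPUs() above shows the probe-and-reset idiom: attempt cudaGetDeviceCount, and on failure clear the sticky runtime error with cudaGetLastError so later calls start clean. A standalone sketch of the same idea without dh::safe_cuda or dmlc::Error:

    #include <cuda_runtime.h>

    // Probe the device count; a CPU-only machine (or missing driver) reports 0.
    int VisibleGpuCount() {
      int n = 0;
      if (cudaGetDeviceCount(&n) != cudaSuccess) {
        cudaGetLastError();  // clear the sticky error so later calls start clean
        return 0;
      }
      return n;
    }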
diff --git a/src/common/cuda_to_hip.h b/src/common/cuda_to_hip.h
new file mode 100644
index 000000000..6033a80b2
--- /dev/null
+++ b/src/common/cuda_to_hip.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#pragma once
+
+#if defined(XGBOOST_USE_HIP)
+
+#define cudaSuccess hipSuccess
+#define cudaGetLastError hipGetLastError
+
+#define cudaStream_t hipStream_t
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamPerThread hipStreamPerThread
+#define cudaStreamLegacy hipStreamLegacy
+
+#define cudaEvent_t hipEvent_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDestroy hipEventDestroy
+
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaDeviceSynchronize hipDeviceSynchronize
+
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+
+#define cudaMallocHost hipMallocHost
+#define cudaFreeHost hipFreeHost
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDefault hipMemcpyDefault
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyHostToHost hipMemcpyHostToHost
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemset hipMemset
+
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+
+#define cudaMemGetInfo hipMemGetInfo
+#define cudaFuncSetAttribute hipFuncSetAttribute
+
+#define cudaDevAttrMultiProcessorCount hipDeviceAttributeMultiprocessorCount
+#define cudaOccupancyMaxActiveBlocksPerMultiprocessor hipOccupancyMaxActiveBlocksPerMultiprocessor
+
+#endif
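Most entries in the new header are pure renames, but a few map a CUDA name onto a differently named HIP entity (cudaPointerAttributes -> hipPointerAttribute_t, cudaDevAttrMultiProcessorCount -> hipDeviceAttributeMultiprocessorCount). That still works because shared code only ever spells the CUDA name; both attribute structs expose a .device member, which data.cu below relies on. An illustrative helper, not part of the patch:

    #include <cuda_runtime.h>  // plus the shim above when XGBOOST_USE_HIP

    // With the shim in scope, this single spelling resolves to
    // hipPointerAttribute_t / hipPointerGetAttributes under HIP.
    int DeviceOfPointer(void const* ptr) {
      cudaPointerAttributes attr;
      if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess) {
        cudaGetLastError();
        return -1;  // plain host pointer, or query failed
      }
      return attr.device;
    }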
diff --git a/src/common/device_helpers.hip.h b/src/common/device_helpers.hip.h
index e7ee49b5a..2852155d4 100644
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -31,6 +31,8 @@
 #include
 #include
 
+#include "cuda_to_hip.h"
+
 #include "../collective/communicator-inl.h"
 #include "common.h"
 #include "xgboost/global_config.h"
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 14b60df33..f727384de 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -330,13 +330,8 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
   } else {
     // copy hessian as weight
     CHECK_EQ(d_weight_out.size(), hessian.size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
                                   cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(),
-                                 hipMemcpyDefault));
-#endif
   }
   return d_weight_out;
 }
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index bc99e6fc4..f86685eda 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -88,19 +88,10 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan batch_iter,
 template
 std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
   int n_mps = 0;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipDeviceGetAttribute(&n_mps, hipDeviceAttributeMultiprocessorCount, device));
-#endif
   int n_blocks_per_mp = 0;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                               kBlockThreads, shared_mem));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
-                                                             kBlockThreads, shared_mem));
-#endif
   std::uint32_t grid_size = n_blocks_per_mp * n_mps;
   return grid_size;
 }
@@ -348,13 +339,7 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer *sketch_container) {
   dh::XGBCachingDeviceAllocator alloc;
-
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#endif
-
   info.weights_.SetDevice(device);
   auto weights = info.weights_.ConstDeviceSpan();
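EstimateGridSize above sizes the launch grid as SM count times per-SM occupancy. A standalone sketch of the same two queries, with an invented kernel and block size:

    #include <cstddef>
    #include <cuda_runtime.h>

    __global__ void DummyKernel() {}  // stand-in for the histogram kernel

    unsigned GridSizeFor(int device, std::size_t shared_mem) {
      constexpr int kBlockThreads = 256;
      int n_mps = 0;
      cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device);
      int n_blocks_per_mp = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, DummyKernel,
                                                    kBlockThreads, shared_mem);
      // Enough blocks to keep every SM at its occupancy limit.
      return static_cast<unsigned>(n_blocks_per_mp) * n_mps;
    }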
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 70e5c448a..a9102f668 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -140,17 +140,10 @@ class HostDeviceVectorImpl {
       SetDevice();
 
       CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                     ptr,
                                     other->Size() * sizeof(T),
                                     cudaMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(this->DevicePointer() + ori_size,
-                                   ptr,
-                                   other->Size() * sizeof(T),
-                                   hipMemcpyDeviceToDevice));
-#endif
     }
   }
 
@@ -204,17 +197,10 @@ class HostDeviceVectorImpl {
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(data_h_.data(),
                              data_d_->data().get(),
                              data_d_->size() * sizeof(T),
                              cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(data_h_.data(),
-                            data_d_->data().get(),
-                            data_d_->size() * sizeof(T),
-                            hipMemcpyDeviceToHost));
-#endif
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -228,17 +214,10 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(data_h_.size());
     SetDevice();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
                                   data_h_.data(),
                                   data_d_->size() * sizeof(T),
                                   cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(),
-                                 data_h_.data(),
-                                 data_d_->size() * sizeof(T),
-                                 hipMemcpyHostToDevice));
-#endif
     gpu_access_ = access;
   }
 
@@ -264,13 +243,8 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
                                     data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                   data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
     }
   }
 
@@ -279,13 +253,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
                                   data_d_->size() * sizeof(T), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data_d_->data().get(), begin,
-                                 data_d_->size() * sizeof(T), hipMemcpyDefault));
-#endif
   }
 
   void LazyResizeDevice(size_t new_size) {
@@ -297,11 +266,7 @@ class HostDeviceVectorImpl {
   void SetDevice() {
     CHECK_GE(device_, 0);
     if (cudaSetDeviceHandler == nullptr) {
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
     } else {
       (*cudaSetDeviceHandler)(device_);
     }
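HostDeviceVectorImpl above keeps mirrored host and device buffers and copies only when the access mode changes. A toy sketch of that lazy mirroring; the real class tracks read/write access levels, this sketch only a dirty flag:

    #include <cstddef>
    #include <cuda_runtime.h>
    #include <vector>

    template <typename T>
    class MirroredVector {
      std::vector<T> host_;
      T* device_{nullptr};
      bool device_dirty_{false};

     public:
      explicit MirroredVector(std::size_t n) : host_(n) {
        cudaMalloc(&device_, n * sizeof(T));
      }
      ~MirroredVector() { cudaFree(device_); }

      T* Device() {  // upload before handing out a writable device view
        cudaMemcpyAsync(device_, host_.data(), host_.size() * sizeof(T),
                        cudaMemcpyHostToDevice);
        device_dirty_ = true;
        return device_;
      }
      std::vector<T>& Host() {  // download only if the device side was written
        if (device_dirty_) {
          cudaMemcpy(host_.data(), device_, host_.size() * sizeof(T),
                     cudaMemcpyDeviceToHost);
          device_dirty_ = false;
        }
        return host_;
      }
    };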
diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh
index 1d97f9b21..1f68c6ce7 100644
--- a/src/common/linalg_op.cuh
+++ b/src/common/linalg_op.cuh
@@ -12,17 +12,9 @@ namespace xgboost {
 namespace linalg {
 template
-#if defined(XGBOOST_USE_CUDA)
 void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr)
-#elif defined(XGBOOST_USE_HIP)
-void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, hipStream_t s = nullptr)
-#endif
 {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(t.Device().ordinal));
-#endif
   static_assert(std::is_void>::value,
                 "For function with return, use transform instead.");
   if (t.Contiguous()) {
@@ -37,11 +29,7 @@ void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, hipStream_t s
 }
 
 template
-#if defined(XGBOOST_USE_HIP)
-void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, hipStream_t s = nullptr)
-#elif defined(XGBOOST_USE_CUDA)
 void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr)
-#endif
 {
   if (t.Contiguous()) {
     auto ptr = t.Values().data();
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 881275298..9896165ad 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -110,15 +110,9 @@ void CopyTo(Span out, Span src) {
   CHECK_EQ(out.size(), src.size());
   static_assert(std::is_same, std::remove_cv_t>::value);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                 out.size_bytes(),
                                 cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(out.data(), src.data(),
-                               out.size_bytes(),
-                               hipMemcpyDefault));
-#endif
 }
 
 // Compute the merge path.
@@ -251,11 +245,7 @@ common::Span> MergePath(
 void MergeImpl(int32_t device, Span const &d_x, Span const &x_ptr,
                Span const &d_y, Span const &y_ptr,
                Span out, Span out_ptr) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
   CHECK_EQ(d_x.size() + d_y.size(), out.size());
   CHECK_EQ(x_ptr.size(), out_ptr.size());
 
@@ -354,11 +344,7 @@ void MergeImpl(int32_t device, Span const &d_x,
 void SketchContainer::Push(Span entries, Span columns_ptr,
                            common::Span cuts_ptr, size_t total_cuts, Span weights) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   Span out;
   dh::device_vector cuts;
 
@@ -418,11 +404,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
  dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
 
   dh::XGBCachingDeviceAllocator alloc;
@@ -479,11 +461,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col
 
 void SketchContainer::Prune(size_t to) {
   timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
 
   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -518,11 +496,7 @@ void SketchContainer::Prune(size_t to) {
 
 void SketchContainer::Merge(Span d_that_columns_ptr,
                             Span that) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   timer_.Start(__func__);
 
   if (this->Current().size() == 0) {
@@ -558,11 +532,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr,
 }
 
 void SketchContainer::FixError() {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
   auto in = dh::ToSpan(this->Current());
@@ -588,11 +558,7 @@ void SketchContainer::FixError() {
 }
 
 void SketchContainer::AllReduce(bool is_column_split) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
@@ -674,11 +640,7 @@ struct InvalidCatOp {
 
 void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   p_cuts->min_vals_.Resize(num_columns_);
 
   // Sync between workers.
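quantile.cu funnels span-to-span copies through a small CopyTo helper so that cudaMemcpyDefault (unified addressing) picks the copy direction; that is what lets a single spelling serve host and device operands. A generic re-sketch with raw pointers in place of xgboost's Span:

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Re-sketch of the CopyTo pattern; the direction is inferred from the
    // pointers themselves via cudaMemcpyDefault.
    template <typename T>
    void CopyTo(T* out, T const* src, std::size_t n) {
      assert(out != nullptr || n == 0);
      cudaMemcpyAsync(out, src, n * sizeof(T), cudaMemcpyDefault);
    }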
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 79db5d857..221706274 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -176,11 +176,7 @@ class SketchContainer {
   size_t Unique(KeyComp key_comp = thrust::equal_to{}) {
     timer_.Start(__func__);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_));
-#else
     dh::safe_cuda(cudaSetDevice(device_));
-#endif
     this->columns_ptr_.SetDevice(device_);
     Span d_column_scan = this->columns_ptr_.DeviceSpan();
 
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
index 39aee4073..e9347aa82 100644
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -147,13 +147,8 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
     auto const& h_group_ptr = info.group_ptr_;
     group_ptr_.Resize(h_group_ptr.size());
     auto d_group_ptr = group_ptr_.DeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
                                   cudaMemcpyHostToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
-                                 hipMemcpyHostToDevice, cuctx->Stream()));
-#endif
   }
 
   auto d_group_ptr = DataGroupPtr(ctx);
diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh
index 23fda9256..77cf709d3 100644
--- a/src/common/threading_utils.cuh
+++ b/src/common/threading_utils.cuh
@@ -61,13 +61,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr,
                        out_group_threads_ptr.size());
 
   size_t total = 0;
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
-                          sizeof(total), hipMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
                            sizeof(total), cudaMemcpyDeviceToHost));
-#endif
   return total;
 }
 
diff --git a/src/data/array_interface.cu b/src/data/array_interface.cu
index cf4117656..492c24200 100644
--- a/src/data/array_interface.cu
+++ b/src/data/array_interface.cu
@@ -28,11 +28,7 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
     // default per-thread stream
     default: {
      dh::CUDAEvent e;
-#if defined(XGBOOST_USE_CUDA)
      e.Record(dh::CUDAStreamView{reinterpret_cast(stream)});
-#elif defined(XGBOOST_USE_HIP)
-      e.Record(dh::CUDAStreamView{reinterpret_cast(stream)});
-#endif
      dh::DefaultStream().Wait(e);
    }
  }
diff --git a/src/data/data.cu b/src/data/data.cu
index 3fe44ee12..b1b75f5e6 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -22,19 +22,11 @@ namespace cub = hipcub;
 namespace xgboost {
 namespace {
 auto SetDeviceToPtr(void const* ptr) {
-#if defined(XGBOOST_USE_CUDA)
   cudaPointerAttributes attr;
   dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
   int32_t ptr_device = attr.device;
   dh::safe_cuda(cudaSetDevice(ptr_device));
   return ptr_device;
-#elif defined(XGBOOST_USE_HIP) /* this is wrong, need to figure out */
-  hipPointerAttribute_t attr;
-  dh::safe_cuda(hipPointerGetAttributes(&attr, ptr));
-  int32_t ptr_device = attr.device;
-  dh::safe_cuda(hipSetDevice(ptr_device));
-  return ptr_device;
-#endif
 }
 
 template
@@ -57,13 +49,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
     // set data
     data->Resize(array.n);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
                                   cudaMemcpyDefault, ctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T),
-                                 hipMemcpyDefault, ctx->Stream()));
-#endif
   });
   return;
 }
@@ -114,13 +101,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_
   });
   bool non_dec = true;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpy(&non_dec, flag.data().get(), sizeof(bool),
                            cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpy(&non_dec, flag.data().get(), sizeof(bool),
-                          hipMemcpyDeviceToHost));
-#endif
   CHECK(non_dec) << "`qid` must be sorted in increasing order along with data.";
 
   size_t bytes = 0;
diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh
index 361d808ad..7b907f7e2 100644
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -123,11 +123,7 @@ class CudfAdapter : public detail::SingleBatchDataIter {
     device_idx_ = dh::CudaGetPointerDevice(first_column.data);
     CHECK_NE(device_idx_, Context::kCpuId);
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_idx_));
-#endif
 
     for (auto& json_col : json_columns) {
       auto column = ArrayInterface<1>(get(json_col));
@@ -216,18 +212,10 @@ class CupyAdapter : public detail::SingleBatchDataIter {
 template
 std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, int device_idx,
                          float missing) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#endif
   IsValidFunctor is_valid(missing);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
-#endif
 
   auto n_samples = batch.NumRows();
   bst_feature_t n_features = batch.NumCols();
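GetRowCounts above zeroes the per-row offset buffer and then counts entries passing the missing-value predicate. A hedged thrust sketch of the counting idea; the flat value layout is invented, since the real adapter batch carries per-row indexing:

    #include <cstddef>
    #include <thrust/count.h>
    #include <thrust/device_vector.h>

    // An entry is valid when it is neither NaN nor the user's missing marker.
    struct IsValid {
      float missing;
      __host__ __device__ bool operator()(float v) const {
        return v == v && v != missing;  // v == v filters NaN
      }
    };

    std::size_t CountValid(thrust::device_vector<float> const& values, float missing) {
      return thrust::count_if(values.begin(), values.end(), IsValid{missing});
    }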
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index da6b52cc4..58b96b665 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -107,11 +107,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
       n_rows(n_rows) {
   monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
 
   monitor_.Start("InitCompressedData");
   InitCompressedData(device);
@@ -132,11 +128,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#endif
 
   n_rows = dmat->Info().num_row_;
 
@@ -330,11 +322,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
                                  common::Span row_counts_span,
                                  common::Span feature_types, size_t row_stride,
                                  size_t n_rows, common::HistogramCuts const& cuts) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
 
   *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
   CopyDataToEllpack(batch, feature_types, this, device, missing);
@@ -409,13 +397,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
 
   dh::device_vector row_ptr(page.row_ptr.size());
   auto d_row_ptr = dh::ToSpan(row_ptr);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
                                 cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
-                               hipMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
-#endif
 
   auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
   auto null = accessor.NullValue();
@@ -570,27 +553,15 @@ void EllpackPageImpl::CreateHistIndices(int device,
 
     if (row_batch.data.DeviceCanRead()) {
       auto const& d_data = row_batch.data.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           entries_d.data().get(), d_data.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), d_data.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
     } else {
       const std::vector& data_vec = row_batch.data.ConstHostVector();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
          n_entries * sizeof(Entry), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          entries_d.data().get(), data_vec.data() + ent_cnt_begin,
-          n_entries * sizeof(Entry), hipMemcpyDefault));
-#endif
     }
 
     const dim3 block3(32, 8, 1);  // 256 threads
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 2247d281e..abfc400c1 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -10,11 +10,7 @@ namespace xgboost::data {
 void EllpackPageSource::Fetch() {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_));
-#endif
   if (!this->ReadCache()) {
     if (count_ != 0 && !sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
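InitFromCUDA below records the thread's current device and falls back to it whenever the context reports CPU. A sketch of that resolution rule; kCpuId and ctx_ordinal are stand-ins for Context::kCpuId and ctx->gpu_id:

    #include <cuda_runtime.h>

    // Resolve the ordinal to use: honour an explicit GPU context, otherwise
    // reuse whatever device the caller's thread already has current.
    int ResolveOrdinal(int ctx_ordinal) {
      constexpr int kCpuId = -1;
      int current_device = 0;
      cudaGetDevice(&current_device);
      return ctx_ordinal == kCpuId ? current_device : ctx_ordinal;
    }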
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index a878ff115..4825b58e7 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -47,11 +47,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   int32_t current_device;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaGetDevice(&current_device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetDevice(&current_device));
-#endif
   auto get_device = [&]() -> int32_t {
     std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
@@ -68,11 +64,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   // ctx_.gpu_id = proxy->DeviceIdx();
   CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
 
   if (cols == 0) {
     cols = num_cols();
@@ -111,11 +103,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   auto n_features = cols;
   CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(get_device()));
-#endif
 
   if (!ref) {
     HostDeviceVector ft;
@@ -156,11 +144,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
   while (iter.Next()) {
     init_page();
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(get_device()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(get_device()));
-#endif
     auto rows = num_rows();
     dh::device_vector row_counts(rows + 1, 0);
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index fe81a0f4d..39d701b43 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -25,11 +25,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
                    : adapter->DeviceIdx();
   CHECK_GE(device, 0);
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device));
-#endif
 
   Context ctx;
   ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh
index 6b25afd45..a26899ff1 100644
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -57,11 +57,7 @@ template
 void CountRowOffsets(const AdapterBatchT& batch, common::Span offset,
                      int device_idx, float missing) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx));
-#endif
   IsValidFunctor is_valid(missing);
 
   // Count elements per row
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index 51c144f11..1c1ae1ba4 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -60,11 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
       return;
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
 
     // The begin and end indices for the section of each column associated with
     // this device
@@ -92,17 +88,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
         auto col = page[fidx];
         auto seg = column_segments[fidx];
-#if defined(XGBOOST_USE_CUDA)
         dh::safe_cuda(cudaMemcpy(
             data_.data().get() + row_ptr_[fidx],
             col.data() + seg.first,
             sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-        dh::safe_cuda(hipMemcpy(
-            data_.data().get() + row_ptr_[fidx],
-            col.data() + seg.first,
-            sizeof(Entry) * (seg.second - seg.first), hipMemcpyHostToDevice));
-#endif
       }
   }
 
@@ -182,11 +171,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   // This needs to be public because of the __device__ lambda.
   GradientPair GetBiasGradient(int group_idx, int num_group) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     auto counting = thrust::make_counting_iterator(0ull);
     auto f = [=] __device__(size_t idx) {
@@ -211,11 +196,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   // This needs to be public because of the __device__ lambda.
   GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     common::Span d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
     size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
@@ -249,17 +230,10 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   }
 
   void UpdateGpair(const std::vector &host_gpair) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         gpair_.data().get(),
         host_gpair.data(),
         gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        gpair_.data().get(),
-        host_gpair.data(),
-        gpair_.size() * sizeof(GradientPair), hipMemcpyHostToDevice));
-#endif
   }
 
   // training parameter
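UpdateGpair above streams host gradient pairs to the device. One caveat worth noting: cudaMemcpyAsync from pageable host memory silently degrades to a synchronous copy, so pinned memory would be needed for real overlap. A hedged sketch, with GradPair standing in for xgboost's GradientPair:

    #include <cuda_runtime.h>
    #include <vector>

    struct GradPair { float grad, hess; };

    void Upload(GradPair* d_gpair, std::vector<GradPair> const& h_gpair) {
      cudaMemcpyAsync(d_gpair, h_gpair.data(), h_gpair.size() * sizeof(GradPair),
                      cudaMemcpyHostToDevice);
      cudaDeviceSynchronize();  // make the host buffer safe to reuse
    }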
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 0586f1a03..7f8fa38be 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -95,11 +95,7 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
              Fn area_fn, std::shared_ptr cache) {
   auto labels = info.labels.View(device);
   auto weights = info.weights_.ConstDeviceSpan();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   CHECK_NE(labels.Size(), 0);
   CHECK_EQ(labels.Size(), predts.size());
@@ -352,11 +348,7 @@ template
 double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
                            common::Span d_class_ptr, size_t n_classes,
                            std::shared_ptr cache, Fn area_fn) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
   /**
    * Sorted idx
    */
@@ -934,11 +926,7 @@ std::pair GPURankingPRAUC(Context const *ctx,
                           common::Span predts, MetaInfo const &info,
                           std::shared_ptr *p_cache) {
-#if defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
-#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
-#endif
 
   if (predts.empty()) {
     return std::make_pair(0.0, static_cast(0));
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index 6c27f4100..ba236a0be 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -166,12 +166,7 @@ class MultiClassMetricsReduction {
       labels.SetDevice(device_);
       weights.SetDevice(device_);
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(device_));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(device_));
-#endif
-
       result = DeviceReduceMetrics(weights, labels, preds, n_class);
     }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index e4accc436..ef49687f9 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -159,11 +159,7 @@ class ElementWiseSurvivalMetricsReduction {
       labels_upper_bound.SetDevice(ctx.gpu_id);
       weights.SetDevice(ctx.gpu_id);
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(ctx.gpu_id));
-#endif
 
       result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
     }
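adaptive.cu below stages tiny scalars (run counts) back to pinned host memory on a dedicated copy stream, so the compute stream keeps working while the readback completes. A sketch of that pattern with invented names; pinned memory is what makes the device-to-host cudaMemcpyAsync truly asynchronous:

    #include <cstddef>
    #include <cuda_runtime.h>

    void ReadbackCount(std::size_t const* d_num_runs) {
      std::size_t* h_num_runs = nullptr;
      cudaMallocHost(reinterpret_cast<void**>(&h_num_runs), sizeof(std::size_t));

      cudaStream_t copy_stream;
      cudaStreamCreate(&copy_stream);
      cudaMemcpyAsync(h_num_runs, d_num_runs, sizeof(std::size_t),
                      cudaMemcpyDeviceToHost, copy_stream);
      cudaStreamSynchronize(copy_stream);  // wait on the copy, not the compute

      // ... use *h_num_runs ...
      cudaStreamDestroy(copy_stream);
      cudaFreeHost(h_num_runs);
    }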
diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu
index 4bbabbf28..4835373ad 100644
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -30,22 +30,13 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos
                           dh::device_vector* p_ridx, HostDeviceVector* p_nptr,
                           HostDeviceVector* p_nidx, RegTree const& tree) {
   // copy position to buffer
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
   auto cuctx = ctx->CUDACtx();
   size_t n_samples = position.size();
   dh::device_vector sorted_position(position.size());
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
                                 position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(sorted_position.data().get(), position.data(),
-                               position.size_bytes(), hipMemcpyDeviceToDevice, cuctx->Stream()));
-#endif
 
   p_ridx->resize(position.size());
   dh::Iota(dh::ToSpan(*p_ridx));
@@ -98,17 +89,10 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos
   bst_node_t* h_first_unique =
       reinterpret_cast(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
                                 cudaMemcpyDeviceToHost, copy_stream.View()));
   dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
                                 cudaMemcpyDeviceToHost, copy_stream.View()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-  dh::safe_cuda(hipMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
-                               hipMemcpyDeviceToHost, copy_stream.View()));
-#endif
 
   /**
    * copy node index (leaf index)
@@ -171,11 +155,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos
 void UpdateTreeLeafDevice(Context const* ctx, common::Span position,
                           std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                           HostDeviceVector const& predt, float alpha, RegTree* p_tree) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(ctx->Ordinal()));
-#endif
   dh::device_vector ridx;
   HostDeviceVector nptr;
   HostDeviceVector nidx;
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index 14bd31063..f0a7f1d5e 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -297,11 +297,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const
             linalg::Matrix* out_gpair) {
   // boilerplate
   std::int32_t device_id = ctx->gpu_id;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_id));
-#endif
 
   auto n_groups = p_cache->Groups();
   info.labels.SetDevice(device_id);
@@ -385,11 +381,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                                linalg::Matrix* out_gpair) {
   // boilerplate
   auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
   auto const discount = p_cache->Discount(ctx);
@@ -457,11 +449,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
                               linalg::VectorView li, linalg::VectorView lj,
                               linalg::Matrix* out_gpair) {
   auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   info.labels.SetDevice(device);
   predt.SetDevice(device);
@@ -500,11 +488,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
                                    linalg::VectorView li, linalg::VectorView lj,
                                    linalg::Matrix* out_gpair) {
   auto device = ctx->Device();
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device.ordinal));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device.ordinal));
-#endif
 
   info.labels.SetDevice(device);
   predt.SetDevice(device);
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index d5c08c22f..b1ab57b98 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -341,11 +341,7 @@ class DeviceModel {
   int num_group;
 
   void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(gpu_id));
-#endif
 
     // Copy decision trees to device
     tree_segments = HostDeviceVector({}, gpu_id);
@@ -366,21 +362,12 @@ class DeviceModel {
       auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
       auto& src_stats = model.trees.at(tree_idx)->GetStats();
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
           sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
       dh::safe_cuda(cudaMemcpyAsync(
           d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
           sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipMemcpyAsync(
-          d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
-          sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
-      dh::safe_cuda(hipMemcpyAsync(
-          d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
-          sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
-#endif
     }
 
     tree_group = HostDeviceVector(model.tree_info.size(), 0, gpu_id);
@@ -504,11 +491,7 @@ void ExtractPaths(
     dh::device_vector> *paths,
     DeviceModel *model, dh::device_vector *path_categories,
     int gpu_id) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(gpu_id));
-#endif
   auto& device_model = *model;
 
   dh::caching_device_vector info(device_model.nodes.Size());
@@ -584,15 +567,9 @@ void ExtractPaths(
         thrust::max_element(thrust::device, max_elem_it, max_elem_it + d_cat_node_segments.size()) -
         max_elem_it;
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
                              d_cat_node_segments.data() + max_cat_it,
                              h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpy(h_max_cat.data(),
-                            d_cat_node_segments.data() + max_cat_it,
-                            h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
-#endif
     max_cat = h_max_cat[0].size;
     CHECK_GE(max_cat, 1);
     path_categories->resize(max_cat * paths->size());
@@ -786,11 +763,7 @@ class ColumnSplitHelper {
   void PredictDMatrix(DMatrix* dmat, HostDeviceVector* out_preds, DeviceModel const& model,
                       bst_feature_t num_features, std::uint32_t num_group) const {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     dh::caching_device_vector decision_storage{};
     dh::caching_device_vector missing_storage{};
@@ -970,11 +943,7 @@ class GPUPredictor : public xgboost::Predictor {
 
   ~GPUPredictor() override {
     if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     }
   }
 
@@ -1071,11 +1040,7 @@ class GPUPredictor : public xgboost::Predictor {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     out_contribs->SetDevice(ctx_->gpu_id);
 
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1135,11 +1100,7 @@ class GPUPredictor : public xgboost::Predictor {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     out_contribs->SetDevice(ctx_->gpu_id);
 
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1199,11 +1160,7 @@ class GPUPredictor : public xgboost::Predictor {
   void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *predictions,
                    const gbm::GBTreeModel &model, unsigned tree_end) const override {
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
 
     const MetaInfo& info = p_fmat->Info();
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index b6f21004f..ad5992602 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -427,15 +427,9 @@ void GPUHistEvaluator::CopyToHost(const std::vector &nidx) {
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
-        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
-#endif
   }
 }
 
@@ -516,13 +510,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
       dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
-                               hipMemcpyDeviceToHost));
-#endif
   return root_entry;
 }
 }  // namespace xgboost::tree
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 2cbe13a22..b23cb670b 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -59,13 +59,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
                   common::Span);
   }
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaGetLastError());
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipGetLastError());
-#endif
 }
 }  // namespace tree
diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index b1c738142..b1ded6cda 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -16,22 +16,14 @@ namespace tree {
 
 RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
     : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx_));
-#endif
   ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
   thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
 }
 
 RowPartitioner::~RowPartitioner() {
-#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx_));
-#elif defined(XGBOOST_USE_HIP)
-  dh::safe_cuda(hipSetDevice(device_idx_));
-#endif
 }
 
 common::Span RowPartitioner::GetRows(bst_node_t nidx) {
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 38938e848..74f0dee2b 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -287,15 +287,9 @@ class RowPartitioner {
       total_rows += ridx_segments_.at(nidx.at(i)).segment.Size();
     }
 
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
-                                 h_batch_info.size() * sizeof(PerNodeData),
-                                 hipMemcpyDefault));
-#else
     dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                   h_batch_info.size() * sizeof(PerNodeData),
                                   cudaMemcpyDefault));
-#endif
 
     // Temporary arrays
     auto h_counts = pinned_.GetSpan(nidx.size(), 0);
@@ -305,13 +299,8 @@ class RowPartitioner {
     SortPositionBatch(
         dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts),
         total_rows, op, &tmp_);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
                                   cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(),
-                                 hipMemcpyDefault));
-#endif
     // TODO(Rory): this synchronisation hurts performance a lot
     // Future optimisation should find a way to skip this
     dh::DefaultStream().Sync();
@@ -348,15 +337,9 @@ class RowPartitioner {
   void FinalisePosition(common::Span d_out_position, FinalisePositionOpT op) {
     dh::TemporaryArray d_node_info_storage(ridx_segments_.size());
-#if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
-                                 sizeof(NodePositionInfo) * ridx_segments_.size(),
-                                 hipMemcpyDefault));
-#else
     dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
                                   sizeof(NodePositionInfo) * ridx_segments_.size(),
                                   cudaMemcpyDefault));
-#endif
 
     constexpr int kBlockSize = 512;
     const int kItemsThread = 8;
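UpdatePositionBatch and FinalisePosition above pack per-node segment metadata into one host array and ship it with a single cudaMemcpyAsync instead of one copy per node, amortising launch and copy latency. A sketch of the pattern, with NodeInfo as a stand-in for the real per-node structs:

    #include <cuda_runtime.h>
    #include <vector>

    struct NodeInfo { int begin, end; };

    void UploadNodeInfo(NodeInfo* d_info, std::vector<NodeInfo> const& h_info,
                        cudaStream_t stream) {
      // One batched copy covers every node being processed this round.
      cudaMemcpyAsync(d_info, h_info.data(), h_info.size() * sizeof(NodeInfo),
                      cudaMemcpyDefault, stream);
    }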
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 6e06450fb..58074a79e 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -232,26 +232,16 @@ struct GPUHistMakerDevice {
     this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
                                 param.colsample_bynode, param.colsample_bylevel,
                                 param.colsample_bytree);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
 
     this->interaction_constraints.Reset();
 
     if (d_gpair.size() != dh_gpair->Size()) {
       d_gpair.resize(dh_gpair->Size());
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
                                   dh_gpair->Size() * sizeof(GradientPair),
                                   cudaMemcpyDeviceToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
-                                 dh_gpair->Size() * sizeof(GradientPair),
-                                 hipMemcpyDeviceToDevice));
-#endif
     auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
     page = sample.page;
     gpair = sample.gpair;
@@ -338,28 +328,15 @@ struct GPUHistMakerDevice {
       max_active_features =
           std::max(max_active_features, static_cast(input.feature_set.size()));
     }
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         d_node_inputs.data().get(), h_node_inputs.data(),
         h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(
-        d_node_inputs.data().get(), h_node_inputs.data(),
-        h_node_inputs.size() * sizeof(EvaluateSplitInputs), hipMemcpyDefault));
-#endif
 
     this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
                                     shared_inputs, dh::ToSpan(entries));
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
                                   entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
                                   cudaMemcpyDeviceToHost));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(pinned_candidates_out.data(),
-                                 entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
-                                 hipMemcpyDeviceToHost));
-#endif
-
     dh::DefaultStream().Sync();
   }
 
@@ -412,13 +389,8 @@ struct GPUHistMakerDevice {
     BitVector missing_bits{dh::ToSpan(missing_storage)};
 
     dh::TemporaryArray split_data_storage(num_candidates);
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(),
                                   num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(split_data_storage.data().get(), split_data.data(),
-                                 num_candidates * sizeof(NodeSplitData), hipMemcpyDefault));
-#endif
     auto d_split_data = dh::ToSpan(split_data_storage);
 
     dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable {
@@ -527,15 +499,9 @@ struct GPUHistMakerDevice {
 
     dh::TemporaryArray d_nodes(p_tree->GetNodes().size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
                                   d_nodes.size() * sizeof(RegTree::Node),
                                   cudaMemcpyHostToDevice));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
-                                 d_nodes.size() * sizeof(RegTree::Node),
-                                 hipMemcpyHostToDevice));
-#endif
 
     auto const& h_split_types = p_tree->GetSplitTypes();
     auto const& categories = p_tree->GetSplitCategories();
@@ -606,15 +572,9 @@ struct GPUHistMakerDevice {
     auto s_position = p_out_position->ConstDeviceSpan();
     positions.resize(s_position.size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
                                   s_position.size_bytes(), cudaMemcpyDeviceToDevice,
                                   ctx_->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(positions.data().get(), s_position.data(),
-                                 s_position.size_bytes(), hipMemcpyDeviceToDevice,
-                                 ctx_->CUDACtx()->Stream()));
-#endif
 
     dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
       bst_node_t position = d_out_position[idx];
@@ -632,26 +592,16 @@ struct GPUHistMakerDevice {
     CHECK(out_preds_d.Device().IsCUDA());
     CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->Ordinal()));
-#endif
 
     auto d_position = dh::ToSpan(positions);
     CHECK_EQ(out_preds_d.Size(), d_position.size());
 
     auto const& h_nodes = p_tree->GetNodes();
     dh::caching_device_vector nodes(h_nodes.size());
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
                                   h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice,
                                   ctx_->CUDACtx()->Stream()));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMemcpyAsync(nodes.data().get(), h_nodes.data(),
-                                 h_nodes.size() * sizeof(RegTree::Node), hipMemcpyHostToDevice,
-                                 ctx_->CUDACtx()->Stream()));
-#endif
     auto d_nodes = dh::ToSpan(nodes);
 
     CHECK_EQ(out_preds_d.Shape(1), 1);
@@ -904,11 +854,7 @@ class GPUHistMaker : public TreeUpdater {
         ++t_idx;
       }
-#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaGetLastError());
-#elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipGetLastError());
-#endif
     } catch (const std::exception& e) {
       LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
     }
@@ -925,11 +871,7 @@ class GPUHistMaker : public TreeUpdater {
     this->column_sampler_ = std::make_shared(column_sampling_seed);
     auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
-#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-#elif defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
-#endif
     info_->feature_types.SetDevice(ctx_->gpu_id);
     maker = std::make_unique(