diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh
index b1e903821..6168388f0 100644
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -36,21 +36,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
  private:
  static constexpr std::size_t kUuidLength =
-#if defined(XGBOOST_USE_HIP)
-     sizeof(hipUUID) / sizeof(uint64_t);
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
      sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
+#elif defined(XGBOOST_USE_HIP)
+     sizeof(hipUUID) / sizeof(uint64_t);
 #endif
 
  void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
-#if defined(XGBOOST_USE_HIP)
-    hipUUID id;
-    hipDeviceGetUuid(&id, device_ordinal_);
-    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
     cudaDeviceProp prob{};
     dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
     std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+#elif defined(XGBOOST_USE_HIP)
+    hipUUID id;
+    hipDeviceGetUuid(&id, device_ordinal_);
+    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
 #endif
  }
diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index 8bf6bb808..2d80c06d8 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -11,10 +11,10 @@
 #include <cstddef>  // size_t
 #include <cstdint>  // int32_t
 
-#if defined(XGBOOST_USE_HIP)
-#include <hipcub/hipcub.hpp>
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>  // DispatchSegmentedRadixSort,NullType,DoubleBuffer
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
 #endif
 
 #include <iterator>  // distance
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index feddba99e..c4112ee13 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -175,17 +175,17 @@ void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cu
     return thrust::min(num_cuts_per_feature, column_size);
   });
 
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
+  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
+                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
+  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
+                         column_sizes_scan->end(), column_sizes_scan->begin());
+#elif defined(XGBOOST_USE_HIP)
   thrust::exclusive_scan(thrust::hip::par(alloc), cut_ptr_it,
                          cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
   thrust::exclusive_scan(thrust::hip::par(alloc), column_sizes_scan->begin(),
                          column_sizes_scan->end(), column_sizes_scan->begin());
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
-                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
-  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
-                         column_sizes_scan->end(), column_sizes_scan->begin());
 #endif
 }
 
@@ -309,12 +309,12 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                            &sorted_entries);
   dh::XGBDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_HIP)
-  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
-               sorted_entries.end(), detail::EntryCompareOp());
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());
+#elif defined(XGBOOST_USE_HIP)
+  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
+               sorted_entries.end(), detail::EntryCompareOp());
 #endif
 
   if (sketch_container->HasCategorical()) {
@@ -374,14 +374,14 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
       return weights[group_idx];
     });
 
-#if defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
+#if defined(XGBOOST_USE_CUDA)
+    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
                                  is_valid);
-#elif defined(XGBOOST_USE_CUDA)
-    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
+#elif defined(XGBOOST_USE_HIP)
+    auto retit = thrust::copy_if(thrust::hip::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
@@ -397,14 +397,14 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
       return weights[batch.GetElement(idx).row_idx];
     });
 
-#if defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
+#if defined(XGBOOST_USE_CUDA)
+    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
                                  is_valid);
-#elif defined(XGBOOST_USE_CUDA)
-    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
+#elif defined(XGBOOST_USE_HIP)
+    auto retit = thrust::copy_if(thrust::hip::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 1eaa15c70..fac254abf 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -184,15 +184,15 @@ class SketchContainer {
       d_column_scan = this->columns_ptr_.DeviceSpan();
 
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
       size_t n_uniques = dh::SegmentedUnique(
-          thrust::hip::par(alloc), d_column_scan.data(),
+          thrust::cuda::par(alloc), d_column_scan.data(),
           d_column_scan.data() + d_column_scan.size(), entries.data(),
           entries.data() + entries.size(), scan_out.DevicePointer(),
           entries.data(), detail::SketchUnique{}, key_comp);
-#elif defined(XGBOOST_USE_CUDA)
+#elif defined(XGBOOST_USE_HIP)
       size_t n_uniques = dh::SegmentedUnique(
-          thrust::cuda::par(alloc), d_column_scan.data(),
+          thrust::hip::par(alloc), d_column_scan.data(),
           d_column_scan.data() + d_column_scan.size(), entries.data(),
           entries.data() + entries.size(), scan_out.DevicePointer(),
           entries.data(), detail::SketchUnique{}, key_comp);
diff --git a/src/common/stats.cuh b/src/common/stats.cuh
index d61adc41a..0de654818 100644
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -217,12 +217,12 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
   auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                    detail::WeightOp{w_begin, d_sorted_idx});
 
-#if defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan_by_key(thrust::hip::par(caching), scan_key, scan_key + n_weights,
-                                scan_val, weights_cdf.begin());
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                 scan_val, weights_cdf.begin());
+#elif defined(XGBOOST_USE_HIP)
+  thrust::inclusive_scan_by_key(thrust::hip::par(caching), scan_key, scan_key + n_weights,
+                                scan_val, weights_cdf.begin());
 #endif
 
   auto n_segments = std::distance(seg_beg, seg_end) - 1;
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 0c24a4829..abbc4e944 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -6,10 +6,10 @@
 #include
 #include
 
-#if defined(XGBOOST_USE_HIP)
-#include <hipcub/hipcub.hpp>  // NOLINT
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>  // NOLINT
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>  // NOLINT
 #endif
 
 #include
@@ -127,16 +127,16 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
 
-#if defined(XGBOOST_USE_HIP)
-  auto end_unique = thrust::unique_by_key_copy(
-      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
-      dh::tbegin(d_unique_idx));
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   auto end_unique = thrust::unique_by_key_copy(
       thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
      dh::tbegin(d_unique_idx));
+#elif defined(XGBOOST_USE_HIP)
+  auto end_unique = thrust::unique_by_key_copy(
+      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
+      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
+      dh::tbegin(d_unique_idx));
 #endif
 
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
@@ -179,10 +179,10 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
 
   Pair last = cache->fptp.back();
 
-#if defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
+#elif defined(XGBOOST_USE_HIP)
+  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
 #endif
 
   return std::make_tuple(last.first, last.second, auc);
@@ -239,14 +239,14 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
   double tp_sum;
   double auc_sum;
-#if defined(XGBOOST_USE_HIP)
-  thrust::tie(auc_sum, tp_sum) =
-      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
-                     Pair{0.0, 0.0}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::tie(auc_sum, tp_sum) =
       thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
                      Pair{0.0, 0.0}, PairPlus<double, double>{});
+#elif defined(XGBOOST_USE_HIP)
+  thrust::tie(auc_sum, tp_sum) =
+      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
+                     Pair{0.0, 0.0}, PairPlus<double, double>{});
 #endif
 
   if (tp_sum != 0 && !std::isnan(auc_sum)) {
@@ -329,12 +329,12 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
         return auc;
       });
 
-#if defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
+#if defined(XGBOOST_USE_CUDA)
+  thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
+#elif defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
 #endif
@@ -410,9 +410,9 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
+      thrust::cuda::par(alloc),
       dh::tbegin(d_class_ptr),
       dh::tend(d_class_ptr),
       uni_key,
@@ -421,9 +421,9 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#elif defined(XGBOOST_USE_CUDA)
+#elif defined(XGBOOST_USE_HIP)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::cuda::par(alloc),
+      thrust::hip::par(alloc),
       dh::tbegin(d_class_ptr),
       dh::tend(d_class_ptr),
       uni_key,
@@ -553,14 +553,14 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
 
-#if defined(XGBOOST_USE_HIP)
-  size_t n_valid = thrust::count_if(
-      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
-      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   size_t n_valid = thrust::count_if(
       thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
       [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
+#elif defined(XGBOOST_USE_HIP)
+  size_t n_valid = thrust::count_if(
+      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
+      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
 #endif
 
   if (n_valid < info.group_ptr_.size() - 1) {
@@ -659,12 +659,12 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
   /**
    * Scale the AUC with number of items in each group.
    */
-#if defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
-                              dh::tend(s_d_auc), 0.0);
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
                               dh::tend(s_d_auc), 0.0);
+#elif defined(XGBOOST_USE_HIP)
+  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
+                              dh::tend(s_d_auc), 0.0);
 #endif
 
   return std::make_pair(auc, n_valid);
@@ -694,14 +694,14 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
   dh::XGBCachingDeviceAllocator<char> alloc;
   double total_pos, total_neg;
-#if defined(XGBOOST_USE_HIP)
-  thrust::tie(total_pos, total_neg) =
-      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
-                     Pair{0.0, 0.0}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::tie(total_pos, total_neg) =
       thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
                      Pair{0.0, 0.0}, PairPlus<double, double>{});
+#elif defined(XGBOOST_USE_HIP)
+  thrust::tie(total_pos, total_neg) =
+      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
+                     Pair{0.0, 0.0}, PairPlus<double, double>{});
 #endif
 
   if (total_pos <= 0.0 || total_neg <= 0.0) {
@@ -755,13 +755,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
       });
 
   dh::XGBCachingDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
+#if defined(XGBOOST_USE_CUDA)
+  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
+#elif defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
@@ -834,9 +834,9 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
+      thrust::cuda::par(alloc),
       dh::tbegin(d_group_ptr),
       dh::tend(d_group_ptr),
       uni_key,
@@ -845,9 +845,9 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#elif defined(XGBOOST_USE_CUDA)
+#elif defined(XGBOOST_USE_HIP)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::cuda::par(alloc),
+      thrust::hip::par(alloc),
       dh::tbegin(d_group_ptr),
       dh::tend(d_group_ptr),
       uni_key,
@@ -909,14 +909,14 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
         return thrust::make_pair(0.0, static_cast<uint32_t>(1));
       });
 
-#if defined(XGBOOST_USE_HIP)
-    thrust::tie(auc, invalid_groups) = thrust::reduce(
-        thrust::hip::par(alloc), it, it + n_groups,
-        thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
     thrust::tie(auc, invalid_groups) = thrust::reduce(
         thrust::cuda::par(alloc), it, it + n_groups,
         thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
+#elif defined(XGBOOST_USE_HIP)
+    thrust::tie(auc, invalid_groups) = thrust::reduce(
+        thrust::hip::par(alloc), it, it + n_groups,
+        thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
 #endif
   }
   return std::make_pair(auc, n_groups - invalid_groups);
@@ -949,13 +949,13 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
   dh::XGBDeviceAllocator<char> alloc;
   auto labels = info.labels.View(ctx->Device());
-#if defined(XGBOOST_USE_HIP)
-  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
+#if defined(XGBOOST_USE_CUDA)
+  if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
-#elif defined(XGBOOST_USE_CUDA)
-  if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
+#elif defined(XGBOOST_USE_HIP)
+  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
@@ -981,13 +981,13 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
         return thrust::make_pair(y * w, (1.0 - y) * w);
       });
 
-#if defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
+#if defined(XGBOOST_USE_CUDA)
+  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
+#elif defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index 937e31400..f52b28fd1 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -62,6 +62,21 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
         return PackedReduceResult{v, wt};
       },
       PackedReduceResult{}, thrust::plus<PackedReduceResult>());
+#elif defined(XGBOOST_USE_HIP)
+  dh::XGBCachingDeviceAllocator<char> alloc;
+  thrust::counting_iterator<size_t> begin(0);
+  thrust::counting_iterator<size_t> end = begin + labels.Size();
+  result = thrust::transform_reduce(
+      thrust::hip::par(alloc), begin, end,
+      [=] XGBOOST_DEVICE(size_t i) {
+        auto idx = linalg::UnravelIndex(i, labels.Shape());
+        auto sample_id = std::get<0>(idx);
+        auto target_id = std::get<1>(idx);
+        auto res = loss(i, sample_id, target_id);
+        float v{std::get<0>(res)}, wt{std::get<1>(res)};
+        return PackedReduceResult{v, wt};
+      },
+      PackedReduceResult{}, thrust::plus<PackedReduceResult>());
 #else
   common::AssertGPUSupport();
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 542a7b6a5..70cbca529 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -11,7 +11,9 @@
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
 
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
+#define WARP_SIZE 32
+#elif defined(XGBOOST_USE_HIP)
 #include <hip/hip_runtime.h>
 
 #ifdef __AMDGCN_WAVEFRONT_SIZE
@@ -20,8 +22,6 @@
 #endif
 
 #define WARP_SIZE WAVEFRONT_SIZE
-#elif defined(XGBOOST_USE_CUDA)
-#define WARP_SIZE 32
 #endif
 
 #if defined(XGBOOST_USE_HIP)
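
Note (reviewer commentary, not part of the patch): every hunk above makes the same mechanical change — each "#if defined(XGBOOST_USE_HIP) ... #elif defined(XGBOOST_USE_CUDA)" guard is reordered so the CUDA branch is tested first, while neither branch's body is altered. Since the duplicated branches differ only in the Thrust execution policy, the repetition could in principle be factored out with a small helper along the lines below. This is a hedged sketch only; "BackendPolicy" is a hypothetical name that does not exist in XGBoost.

    #include <thrust/execution_policy.h>

    // Return the Thrust execution policy for the enabled GPU backend,
    // checking CUDA first to match the ordering this patch establishes.
    // Only one branch is compiled, so the deduced return type is unambiguous.
    template <typename Alloc>
    auto BackendPolicy(Alloc &alloc) {
    #if defined(XGBOOST_USE_CUDA)
      return thrust::cuda::par(alloc);  // NVIDIA build (Thrust)
    #elif defined(XGBOOST_USE_HIP)
      return thrust::hip::par(alloc);   // ROCm build (rocThrust)
    #endif
    }

    // Call sites would then collapse to a single unguarded statement, e.g.
    //   double auc = thrust::reduce(BackendPolicy(alloc), in, in + n, 0.0);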