From f55243fda0af0b0d42a9eba330a8b841580d7268 Mon Sep 17 00:00:00 2001
From: amdsc21 <96135754+amdsc21@users.noreply.github.com>
Date: Thu, 9 Mar 2023 22:15:10 +0100
Subject: [PATCH] finish evaluate_splits.cu

---
 CMakeLists.txt                        |  2 +-
 src/common/cuda_pinned_allocator.h    |  4 +--
 src/common/transform.h                |  4 ++-
 src/tree/gpu_hist/evaluate_splits.cu  | 49 +++++++++++++++++++++++++--
 src/tree/gpu_hist/evaluate_splits.hip |  4 +++
 src/tree/split_evaluator.h            |  4 ++-
 src/tree/updater_gpu_common.cuh       | 14 ++++++++
 7 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df520dff4..fa26a1aba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,7 +197,7 @@ if (USE_HIP)
   find_package(hipcub REQUIRED)
 
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
-  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
 
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h
index a5152c8a0..11a942de3 100644
--- a/src/common/cuda_pinned_allocator.h
+++ b/src/common/cuda_pinned_allocator.h
@@ -74,7 +74,7 @@ class pinned_allocator {
     pointer result(nullptr);
 
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #else
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #endif
@@ -84,7 +84,7 @@ class pinned_allocator {
   inline void deallocate(pointer p, size_type) {
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipFreeHost(p));
+    dh::safe_cuda(hipHostFree(p));
 #else
     dh::safe_cuda(cudaFreeHost(p));
 #endif
   }
diff --git a/src/common/transform.h b/src/common/transform.h
index 974ee86d6..389ff7f6e 100644
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -17,8 +17,10 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"
 
-#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined (__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
 
 namespace xgboost {
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index c48c8ddf3..b898a8642 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -6,12 +6,22 @@
 #include
 
 #include "../../common/categorical.h"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif
+
 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
 
 namespace xgboost {
+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif
+
 namespace tree {
 
 // With constraints
@@ -99,8 +109,13 @@ class EvaluateSplitAgent {
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);  // NOLINT
     // Broadcast result from thread 0
+#if defined(XGBOOST_USE_CUDA)
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
+#elif defined(XGBOOST_USE_HIP)
+    return {__shfl(local_sum.GetQuantisedGrad(), 0),
+            __shfl(local_sum.GetQuantisedHess(), 0)};
+#endif
   }
 
   // Load using efficient 128 vector load instruction
@@ -124,10 +139,15 @@ class EvaluateSplitAgent {
                                           evaluator, missing_left, rounding)
                        : kNullGain;
     // Find thread with best gain
-    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
+
     // This reduce result is only valid in thread 0
     // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
     auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+    auto best_thread = __shfl(best.key, 0);
+#endif
     // Best thread updates the split
     if (threadIdx.x == best_thread) {
@@ -157,10 +177,15 @@ class EvaluateSplitAgent {
                        : kNullGain;
 
     // Find thread with best gain
-    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
     // This reduce result is only valid in thread 0
     // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
     auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+    auto best_thread = __shfl(best.key, 0);
+#endif
+
     // Best thread updates the split
     if (threadIdx.x == best_thread) {
       int32_t split_gidx = (scan_begin + threadIdx.x);
@@ -186,10 +211,15 @@ class EvaluateSplitAgent {
                        : kNullGain;
 
     // Find thread with best gain
-    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
     // This reduce result is only valid in thread 0
     // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
     auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+    auto best_thread = __shfl(best.key, 0);
+#endif
+
     // Best thread updates the split
     if (threadIdx.x == best_thread) {
       assert(thread_active);
@@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   event.Record(dh::DefaultStream());
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);
+
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
+#endif
   }
 }
@@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
   this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                        dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
+
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                               hipMemcpyDeviceToHost));
+#endif
   return root_entry;
 }
diff --git a/src/tree/gpu_hist/evaluate_splits.hip b/src/tree/gpu_hist/evaluate_splits.hip
index e69de29bb..4469d1c1f 100644
--- a/src/tree/gpu_hist/evaluate_splits.hip
+++ b/src/tree/gpu_hist/evaluate_splits.hip
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "evaluate_splits.cu"
+#endif
diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h
index b6625339d..4ca90b481 100644
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -121,8 +121,10 @@ class TreeEvaluator {
   // Fast floating point division instruction on device
   XGBOOST_DEVICE float Divide(float a, float b) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDA_ARCH__)
     return __fdividef(a, b);
+#elif defined(__HIP_PLATFORM_AMD__)
+    return a / b;
 #else
     return a / b;
 #endif
   }
diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh
index 1637300b6..8e15e90bb 100644
--- a/src/tree/updater_gpu_common.cuh
+++ b/src/tree/updater_gpu_common.cuh
@@ -4,12 +4,26 @@
 #pragma once
 #include
 #include
+#include
+#include
+
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
+
 #include
 #include
 #include
 #include "../common/categorical.h"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
+
 #include "../common/random.h"
 #include "gpu_hist/histogram.cuh"
 #include "param.h"
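
Note on the warp-shuffle hunks: HIP does not provide __shfl_sync, so every
broadcast site above gains the same #if/#elif pair; HIP's __shfl takes no lane
mask and operates across the whole wavefront (typically 64 lanes on AMD GPUs,
versus 32-lane CUDA warps). The `namespace cub = hipcub;` alias plays the same
role for the cub::ArgMax() call sites. A minimal sketch of how the repeated
broadcast could be factored into one helper, assuming the same
XGBOOST_USE_CUDA/XGBOOST_USE_HIP guards used in the diff; BroadcastFromLane0 is
a hypothetical name, not something this patch defines:

    // Broadcast `value` from lane 0 to every lane of the warp/wavefront.
    template <typename T>
    __device__ T BroadcastFromLane0(T value) {
    #if defined(XGBOOST_USE_CUDA)
      // CUDA 9+ requires the *_sync shuffle variants with an explicit lane mask.
      return __shfl_sync(0xffffffffu, value, 0);
    #elif defined(XGBOOST_USE_HIP)
      // HIP's __shfl takes no mask and spans the full wavefront.
      return __shfl(value, 0);
    #endif
    }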
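
Note on the pinned-memory hunks: hipMallocHost and hipFreeHost are the
deprecated spellings; hipHostMalloc and hipHostFree are the current HIP names
for page-locked host allocation, mirroring cudaMallocHost/cudaFreeHost. A
standalone sketch of the pair the allocator now calls (illustrative only, not
taken from this patch):

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
      float* pinned = nullptr;
      // Allocate 256 floats of page-locked (pinned) host memory.
      if (hipHostMalloc(reinterpret_cast<void**>(&pinned), 256 * sizeof(float)) != hipSuccess) {
        std::fprintf(stderr, "hipHostMalloc failed\n");
        return 1;
      }
      pinned[0] = 1.0f;  // usable directly from the host; eligible for async DMA copies
      hipHostFree(pinned);
      return 0;
    }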