finish evaluate_splits.cu

amdsc21 2023-03-09 22:15:10 +01:00
parent 1e09c21456
commit f55243fda0
7 changed files with 73 additions and 8 deletions

View File

@@ -197,7 +197,7 @@ if (USE_HIP)
   find_package(hipcub REQUIRED)
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
-  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
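
Note: the -w added to the existing -Wunused-result flags silences all remaining hipcc (clang) warnings wholesale, and the -D__HIP_PLATFORM_AMD__ define already set here is what drives the compile-time dispatch in the source changes below. A minimal sketch of that dispatch pattern; the DeviceError alias is illustrative, not part of this tree:

#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_runtime.h>
using DeviceError = hipError_t;   // HIP build: AMD platform selected via the -D flag
#else
#include <cuda_runtime.h>
using DeviceError = cudaError_t;  // default: CUDA build
#endif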

View File

@@ -74,7 +74,7 @@ class pinned_allocator {
     pointer result(nullptr);
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #else
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #endif
@@ -84,7 +84,7 @@
   inline void deallocate(pointer p, size_type) {
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipFreeHost(p));
+    dh::safe_cuda(hipHostFree(p));
 #else
     dh::safe_cuda(cudaFreeHost(p));
 #endif
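
Note: hipMallocHost and hipFreeHost are HIP's deprecated spellings; these hunks move to the current hipHostMalloc/hipHostFree pair without touching the CUDA branch. A minimal standalone sketch of the same portable pinned-memory pattern, assuming the same XGBOOST_USE_HIP guard (the helper names are hypothetical):

#include <cstddef>
#if defined(XGBOOST_USE_HIP)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Allocate page-locked (pinned) host memory; returns nullptr on failure.
inline void* PinnedAlloc(std::size_t bytes) {
  void* p = nullptr;
#if defined(XGBOOST_USE_HIP)
  if (hipHostMalloc(&p, bytes) != hipSuccess) return nullptr;
#else
  if (cudaMallocHost(&p, bytes) != cudaSuccess) return nullptr;
#endif
  return p;
}

// Release memory obtained from PinnedAlloc.
inline void PinnedFree(void* p) {
#if defined(XGBOOST_USE_HIP)
  (void)hipHostFree(p);
#else
  (void)cudaFreeHost(p);
#endif
}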

View File

@@ -17,8 +17,10 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"
-#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined (__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
 namespace xgboost {

View File

@@ -6,12 +6,22 @@
 #include <limits>
 #include "../../common/categorical.h"
+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif
 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
 namespace xgboost {
+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif
 namespace tree {
// With constraints
@@ -99,8 +109,13 @@ class EvaluateSplitAgent {
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
     // Broadcast result from thread 0
+#if defined(XGBOOST_USE_CUDA)
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
+#elif defined(XGBOOST_USE_HIP)
+    return {__shfl(local_sum.GetQuantisedGrad(), 0),
+            __shfl(local_sum.GetQuantisedHess(), 0)};
+#endif
   }
   // Load using efficient 128 vector load instruction
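
Note on the branch above: since CUDA 9, warp shuffles on NVIDIA hardware must use the *_sync variants with an explicit member mask. The HIP path uses the unmasked __shfl, the portable choice on ROCm, where wavefronts are 64 lanes wide so a 32-bit full mask would not describe them anyway. A minimal sketch of the broadcast-from-lane-0 idiom, under the same guards (the function name is illustrative):

// Broadcast a value held by lane 0 to every lane of the warp/wavefront.
__device__ inline float BroadcastLane0(float v) {
#if defined(XGBOOST_USE_CUDA)
  return __shfl_sync(0xffffffff, v, 0);  // mask names all 32 CUDA lanes
#elif defined(XGBOOST_USE_HIP)
  return __shfl(v, 0);  // unmasked; spans the 64-lane AMD wavefront
#endif
}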
@ -124,10 +139,15 @@ class EvaluateSplitAgent {
evaluator, missing_left, rounding)
: kNullGain;
// Find thread with best gain
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
// This reduce result is only valid in thread 0
// broadcast to the rest of the warp
#if defined(XGBOOST_USE_CUDA)
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
#elif defined(XGBOOST_USE_HIP)
auto best_thread = __shfl(best.key, 0);
#endif
// Best thread updates the split
if (threadIdx.x == best_thread) {
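
Note on the (int)threadIdx.x cast in this and the two hunks below: the reduce's value type is cub::KeyValuePair<int, float> (which cub::ArgMax expects) while threadIdx.x is unsigned, so the braced initializer {threadIdx.x, gain} performs a narrowing conversion that nvcc merely warns about but hipcc (clang) rejects as an error; the explicit cast is presumably there to satisfy both compilers. A minimal sketch of the pair construction (function name illustrative):

#include <cub/cub.cuh>  // resolves to hipcub through the cub alias on HIP builds

// Build the ArgMax key/value pair with an explicit int key to avoid
// unsigned -> int narrowing inside the braced initializer.
__device__ cub::KeyValuePair<int, float> MakeArgMaxPair(float gain) {
  return {static_cast<int>(threadIdx.x), gain};
}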
@ -157,10 +177,15 @@ class EvaluateSplitAgent {
: kNullGain;
// Find thread with best gain
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
// This reduce result is only valid in thread 0
// broadcast to the rest of the warp
#if defined(XGBOOST_USE_CUDA)
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
#elif defined(XGBOOST_USE_HIP)
auto best_thread = __shfl(best.key, 0);
#endif
// Best thread updates the split
if (threadIdx.x == best_thread) {
int32_t split_gidx = (scan_begin + threadIdx.x);
@ -186,10 +211,15 @@ class EvaluateSplitAgent {
: kNullGain;
// Find thread with best gain
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
// This reduce result is only valid in thread 0
// broadcast to the rest of the warp
#if defined(XGBOOST_USE_CUDA)
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
#elif defined(XGBOOST_USE_HIP)
auto best_thread = __shfl(best.key, 0);
#endif
// Best thread updates the split
if (threadIdx.x == best_thread) {
assert(thread_active);
@@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   event.Record(dh::DefaultStream());
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
+#endif
   }
 }
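
Note: the HIP branch mirrors the CUDA calls one-for-one, and dh::safe_cuda checks the returned status code on either platform. A minimal standalone sketch of the event/stream ordering CopyToHost relies on, with illustrative function and parameter names:

#include <cstddef>
#if defined(XGBOOST_USE_HIP)
#include <hip/hip_runtime.h>
using Stream = hipStream_t;
using Event = hipEvent_t;
#else
#include <cuda_runtime.h>
using Stream = cudaStream_t;
using Event = cudaEvent_t;
#endif

// Record a point on the producing stream, make the copy stream wait on it,
// then issue the device-to-host copy asynchronously on the copy stream.
void CopyAfter(void* dst, const void* src, std::size_t bytes,
               Stream producer, Stream copier, Event ev) {
#if defined(XGBOOST_USE_HIP)
  hipEventRecord(ev, producer);
  hipStreamWaitEvent(copier, ev, 0);
  hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, copier);
#else
  cudaEventRecord(ev, producer);
  cudaStreamWaitEvent(copier, ev, 0);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, copier);
#endif
}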
@@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
   this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                        dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                               hipMemcpyDeviceToHost));
+#endif
   return root_entry;
 }

View File

@@ -0,0 +1,4 @@
+#if defined(XGBOOST_USE_HIP)
+#include "evaluate_splits.cu"
+#endif
+
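
Note: this new file is a thin wrapper translation unit. The HIP build compiles .hip sources, so the wrapper simply re-includes the unmodified CUDA file and hipcc treats the whole thing as HIP C++, avoiding a duplicated copy of the kernels. The same trick in isolation, with hypothetical file names:

// my_kernels.hip: compiled only in the HIP build; pulls in the CUDA
// source so hipcc compiles it as HIP C++.
#if defined(XGBOOST_USE_HIP)
#include "my_kernels.cu"
#endif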

View File

@@ -121,8 +121,10 @@ class TreeEvaluator {
   // Fast floating point division instruction on device
   XGBOOST_DEVICE float Divide(float a, float b) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDA_ARCH__)
     return __fdividef(a, b);
+#elif defined(__HIP_PLATFORM_AMD__)
+    return a / b;
 #else
     return a / b;
 #endif
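
Note: __fdividef is CUDA's reduced-accuracy fast-division intrinsic, and this change confines it to NVIDIA device code, routing the AMD device path to ordinary division instead. A sketch of the resulting dispatch, assuming compilation by nvcc or hipcc (which supply the __host__/__device__ qualifiers):

#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_runtime.h>
#endif

// Fast approximate division only where the CUDA intrinsic exists;
// plain IEEE division on the AMD device path and on the host.
__host__ __device__ inline float FastDivide(float a, float b) {
#if defined(__CUDA_ARCH__)
  return __fdividef(a, b);
#else
  return a / b;
#endif
}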

View File

@@ -4,12 +4,26 @@
 #pragma once
 #include <thrust/random.h>
 #include <cstdio>
 #include <limits>
+#include <float.h>
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
 #include <stdexcept>
 #include <string>
 #include <vector>
 #include "../common/categorical.h"
+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
 #include "../common/random.h"
 #include "gpu_hist/histogram.cuh"
 #include "param.h"