finish evaluate_splits.cu
parent 1e09c21456
commit f55243fda0
@@ -197,7 +197,7 @@ if (USE_HIP)
   find_package(hipcub REQUIRED)

   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
-  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
@@ -74,7 +74,7 @@ class pinned_allocator {
     pointer result(nullptr);

 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #else
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #endif
@@ -84,7 +84,7 @@ class pinned_allocator {

   inline void deallocate(pointer p, size_type) {
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipFreeHost(p));
+    dh::safe_cuda(hipHostFree(p));
 #else
     dh::safe_cuda(cudaFreeHost(p));
 #endif
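Aside, not part of the diff: hipMallocHost and hipFreeHost have been deprecated in HIP in favour of hipHostMalloc/hipHostFree, which is what the two hunks above switch to. A minimal standalone sketch of the same pinned (page-locked) host allocation pattern, using only the public CUDA/HIP runtime APIs and the commit's XGBOOST_USE_HIP switch; the file and function names are invented for illustration, not XGBoost's:

    // pinned_alloc_sketch.cc -- illustrative sketch, not part of this commit.
    #include <cstddef>

    #if defined(XGBOOST_USE_HIP)
    #include <hip/hip_runtime.h>
    #else
    #include <cuda_runtime.h>
    #endif

    // Allocate n floats of page-locked (pinned) host memory; nullptr on failure.
    float* AllocPinned(std::size_t n) {
      float* p = nullptr;
    #if defined(XGBOOST_USE_HIP)
      if (hipHostMalloc(reinterpret_cast<void**>(&p), n * sizeof(float),
                        hipHostMallocDefault) != hipSuccess) {
        return nullptr;
      }
    #else
      if (cudaMallocHost(reinterpret_cast<void**>(&p), n * sizeof(float)) != cudaSuccess) {
        return nullptr;
      }
    #endif
      return p;
    }

    // Release pinned memory with the matching backend call.
    void FreePinned(float* p) {
    #if defined(XGBOOST_USE_HIP)
      (void)hipHostFree(p);
    #else
      (void)cudaFreeHost(p);
    #endif
    }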
@@ -17,8 +17,10 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"

-#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined (__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)

 namespace xgboost {
@@ -6,12 +6,22 @@
 #include <limits>

 #include "../../common/categorical.h"

+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif

 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"

 namespace xgboost {
+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif

 namespace tree {

 // With constraints
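Aside, not part of the diff: the "namespace cub = hipcub;" alias added above is what lets the rest of this file keep using cub:: qualifiers (cub::ArgMax and friends) unchanged on ROCm builds, since hipCUB mirrors CUB's names. The include pattern, sketched on its own under the same macros the commit uses:

    // cub_alias_sketch.h -- illustrative sketch, not part of this commit.
    #if defined(XGBOOST_USE_CUDA)
    #include <cub/cub.cuh>         // NVIDIA CUB
    #elif defined(XGBOOST_USE_HIP)
    #include <hipcub/hipcub.hpp>   // ROCm hipCUB exposes the same names under hipcub::
    namespace cub = hipcub;        // existing cub:: call sites compile unchanged
    #endif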
@@ -99,8 +109,13 @@ class EvaluateSplitAgent {
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
     // Broadcast result from thread 0
+#if defined(XGBOOST_USE_CUDA)
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
+#elif defined(XGBOOST_USE_HIP)
+    return {__shfl(local_sum.GetQuantisedGrad(), 0),
+            __shfl(local_sum.GetQuantisedHess(), 0)};
+#endif
   }

   // Load using efficient 128 vector load instruction
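Aside, not part of the diff: __shfl_sync with an explicit lane mask is a CUDA-specific warp primitive; the hunks in this file fall back to HIP's maskless __shfl for the same lane-0 broadcast (a 32-bit full mask also would not map cleanly onto AMD's typically 64-lane wavefronts). A self-contained sketch of that broadcast helper, assuming the commit's macros; the name BroadcastLane0 is invented for illustration:

    // warp_broadcast_sketch.cuh -- illustrative sketch, not part of this commit.
    // Broadcast the value held by lane 0 to every thread in the warp/wavefront.
    __device__ inline float BroadcastLane0(float v) {
    #if defined(XGBOOST_USE_CUDA)
      return __shfl_sync(0xffffffff, v, 0);  // CUDA: explicit full-warp mask
    #elif defined(XGBOOST_USE_HIP)
      return __shfl(v, 0);                   // HIP: maskless shuffle
    #else
      return v;                              // other builds: nothing to broadcast
    #endif
    }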
@@ -124,10 +139,15 @@ class EvaluateSplitAgent {
                                            evaluator, missing_left, rounding)
                          : kNullGain;
       // Find thread with best gain
-      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());

       // This reduce result is only valid in thread 0
       // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
       auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+      auto best_thread = __shfl(best.key, 0);
+#endif

       // Best thread updates the split
       if (threadIdx.x == best_thread) {
@@ -157,10 +177,15 @@ class EvaluateSplitAgent {
                          : kNullGain;

       // Find thread with best gain
-      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
       // This reduce result is only valid in thread 0
       // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
       auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+      auto best_thread = __shfl(best.key, 0);
+#endif

       // Best thread updates the split
       if (threadIdx.x == best_thread) {
         int32_t split_gidx = (scan_begin + threadIdx.x);
@@ -186,10 +211,15 @@ class EvaluateSplitAgent {
                          : kNullGain;

       // Find thread with best gain
-      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
       // This reduce result is only valid in thread 0
       // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
       auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+      auto best_thread = __shfl(best.key, 0);
+#endif

       // Best thread updates the split
       if (threadIdx.x == best_thread) {
         assert(thread_active);
@@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   event.Record(dh::DefaultStream());
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);

+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
+#endif
   }
 }
@@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
   this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                        dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;

+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                               hipMemcpyDeviceToHost));
+#endif
   return root_entry;
 }
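Aside, not part of the diff: the two memcpy hunks above are the same one-for-one substitution, since hipMemcpyAsync mirrors cudaMemcpyAsync argument-for-argument and only the direction enum changes name. A hedged sketch of a portable device-to-host copy helper; the helper name and error handling are invented for illustration, and the stream type is the raw runtime handle rather than the dh:: wrapper the real code passes:

    // d2h_copy_sketch.h -- illustrative sketch, not part of this commit.
    #include <cstddef>

    #if defined(XGBOOST_USE_HIP)
    #include <hip/hip_runtime.h>
    using StreamT = hipStream_t;
    #else
    #include <cuda_runtime.h>
    using StreamT = cudaStream_t;
    #endif

    // Enqueue an asynchronous device-to-host copy of the given size on the given stream.
    inline bool CopyDeviceToHostAsync(void* dst, const void* src, std::size_t bytes,
                                      StreamT stream) {
    #if defined(XGBOOST_USE_HIP)
      return hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, stream) == hipSuccess;
    #else
      return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream) == cudaSuccess;
    #endif
    }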
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "evaluate_splits.cu"
+#endif
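Aside, not part of the diff: the new four-line file is a thin wrapper so the HIP toolchain can compile the existing evaluate_splits.cu as a HIP translation unit, while a CUDA build sees an empty file. The same pattern applied to a hypothetical second source file (file names invented for illustration):

    // some_kernel.hip -- hypothetical wrapper following the same pattern.
    #if defined(XGBOOST_USE_HIP)
    #include "some_kernel.cu"  // reuse the CUDA source unchanged under the HIP compiler
    #endif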
@@ -121,8 +121,10 @@ class TreeEvaluator {

   // Fast floating point division instruction on device
   XGBOOST_DEVICE float Divide(float a, float b) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDA_ARCH__)
     return __fdividef(a, b);
+#elif defined(__HIP_PLATFORM_AMD__)
+    return a / b;
 #else
     return a / b;
 #endif
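Aside, not part of the diff: __fdividef is CUDA's fast, reduced-precision single-precision division intrinsic, valid only in device code (hence the __CUDA_ARCH__ guard); the hunk above routes HIP device code to ordinary division rather than assuming an equivalent intrinsic exists. A self-contained sketch of the resulting three-way dispatch; SKETCH_DEVICE stands in for XGBoost's XGBOOST_DEVICE macro, which normally expands to __host__ __device__ under a GPU compiler:

    // fast_divide_sketch.h -- illustrative sketch, not part of this commit.
    #if defined(__CUDACC__) || defined(__HIPCC__)
    #define SKETCH_DEVICE __host__ __device__
    #else
    #define SKETCH_DEVICE
    #endif

    SKETCH_DEVICE inline float Divide(float a, float b) {
    #if defined(__CUDA_ARCH__)
      return __fdividef(a, b);  // fast, less precise CUDA device intrinsic
    #elif defined(__HIP_PLATFORM_AMD__)
      return a / b;             // HIP device code: plain division
    #else
      return a / b;             // host code
    #endif
    }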
@@ -4,12 +4,26 @@
 #pragma once
 #include <thrust/random.h>
 #include <cstdio>
+#include <limits>
+#include <float.h>

+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif

 #include <stdexcept>
 #include <string>
 #include <vector>
 #include "../common/categorical.h"

+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif

 #include "../common/random.h"
 #include "gpu_hist/histogram.cuh"
 #include "param.h"