From f55243fda0af0b0d42a9eba330a8b841580d7268 Mon Sep 17 00:00:00 2001
From: amdsc21 <96135754+amdsc21@users.noreply.github.com>
Date: Thu, 9 Mar 2023 22:15:10 +0100
Subject: [PATCH] finish evaluate_splits.cu

---
 CMakeLists.txt                        |  2 +-
 src/common/cuda_pinned_allocator.h    |  4 +--
 src/common/transform.h                |  4 ++-
 src/tree/gpu_hist/evaluate_splits.cu  | 49 +++++++++++++++++++++++++--
 src/tree/gpu_hist/evaluate_splits.hip |  4 +++
 src/tree/split_evaluator.h            |  4 ++-
 src/tree/updater_gpu_common.cuh       | 14 ++++++++
 7 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df520dff4..fa26a1aba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,7 +197,7 @@ if (USE_HIP)
   find_package(hipcub REQUIRED)
 
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
-  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
 
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h
index a5152c8a0..11a942de3 100644
--- a/src/common/cuda_pinned_allocator.h
+++ b/src/common/cuda_pinned_allocator.h
@@ -74,7 +74,7 @@ class pinned_allocator {
     pointer result(nullptr);
 
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #else
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #endif
@@ -84,7 +84,7 @@ class pinned_allocator {
   inline void deallocate(pointer p, size_type) {
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipFreeHost(p));
+    dh::safe_cuda(hipHostFree(p));
 #else
     dh::safe_cuda(cudaFreeHost(p));
 #endif
   }
diff --git a/src/common/transform.h b/src/common/transform.h
index 974ee86d6..389ff7f6e 100644
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -17,8 +17,10 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"
 
-#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined (__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
 
 namespace xgboost {
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index c48c8ddf3..b898a8642 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -6,12 +6,22 @@
 #include
 
 #include "../../common/categorical.h"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif
+
 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
 
 namespace xgboost {
+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif
+
 namespace tree {
 
 // With constraints
@@ -99,8 +109,13 @@ class EvaluateSplitAgent {
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);  // NOLINT
     // Broadcast result from thread 0
+#if defined(XGBOOST_USE_CUDA)
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
+#elif defined(XGBOOST_USE_HIP)
+    return {__shfl(local_sum.GetQuantisedGrad(), 0),
+            __shfl(local_sum.GetQuantisedHess(), 0)};
+#endif
   }
 
   // Load using efficient 128 vector load instruction
@@ -124,10 +139,15 @@ class EvaluateSplitAgent {
                                           evaluator, missing_left, rounding)
                        : kNullGain;
     // Find thread with best gain
-    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
+
     // This reduce result is only valid in thread 0
     // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
     auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+    auto best_thread = __shfl(best.key, 0);
+#endif
     // Best thread updates the split
     if (threadIdx.x == best_thread) {
@@ -157,10 +177,15 @@ class EvaluateSplitAgent {
                        : kNullGain;
 
     // Find thread with best gain
-    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
     // This reduce result is only valid in thread 0
     // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
     auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+    auto best_thread = __shfl(best.key, 0);
+#endif
+
     // Best thread updates the split
     if (threadIdx.x == best_thread) {
       int32_t split_gidx = (scan_begin + threadIdx.x);
@@ -186,10 +211,15 @@ class EvaluateSplitAgent {
                        : kNullGain;
 
     // Find thread with best gain
-    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+    auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
     // This reduce result is only valid in thread 0
     // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
     auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+    auto best_thread = __shfl(best.key, 0);
+#endif
+
     // Best thread updates the split
     if (threadIdx.x == best_thread) {
       assert(thread_active);
@@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   event.Record(dh::DefaultStream());
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);
+
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
+#endif
   }
 }
@@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
   this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                        dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
+
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                               hipMemcpyDeviceToHost));
+#endif
   return root_entry;
 }
diff --git a/src/tree/gpu_hist/evaluate_splits.hip b/src/tree/gpu_hist/evaluate_splits.hip
index e69de29bb..4469d1c1f 100644
--- a/src/tree/gpu_hist/evaluate_splits.hip
+++ b/src/tree/gpu_hist/evaluate_splits.hip
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "evaluate_splits.cu"
+#endif
diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h
index b6625339d..4ca90b481 100644
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -121,8 +121,10 @@ class TreeEvaluator {
   // Fast floating point division instruction on device
   XGBOOST_DEVICE float Divide(float a, float b) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDA_ARCH__)
     return __fdividef(a, b);
+#elif defined(__HIP_PLATFORM_AMD__)
+    return a / b;
 #else
     return a / b;
 #endif
   }
diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh
index 1637300b6..8e15e90bb 100644
--- a/src/tree/updater_gpu_common.cuh
+++ b/src/tree/updater_gpu_common.cuh
@@ -4,12 +4,26 @@
 #pragma once
 #include
 #include
+#include
+#include
+
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
+
 #include
 #include
 #include
 #include "../common/categorical.h"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
+
 #include "../common/random.h"
 #include "gpu_hist/histogram.cuh"
 #include "param.h"
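
Note on the warp-shuffle hunks: HIP does not provide __shfl_sync, so every
broadcast site above gains the same #if/#elif pair; HIP's __shfl takes no lane
mask and operates across the whole wavefront (typically 64 lanes on AMD GPUs,
versus 32-lane CUDA warps). The `namespace cub = hipcub;` alias plays the same
role for the cub::ArgMax() call sites. A minimal sketch of how the repeated
broadcast could be factored into one helper, assuming the same
XGBOOST_USE_CUDA/XGBOOST_USE_HIP guards used in the diff; BroadcastFromLane0 is
a hypothetical name, not something this patch defines:

    // Broadcast `value` from lane 0 to every lane of the warp/wavefront.
    template <typename T>
    __device__ T BroadcastFromLane0(T value) {
    #if defined(XGBOOST_USE_CUDA)
      // CUDA 9+ requires the *_sync shuffle variants with an explicit lane mask.
      return __shfl_sync(0xffffffffu, value, 0);
    #elif defined(XGBOOST_USE_HIP)
      // HIP's __shfl takes no mask and spans the full wavefront.
      return __shfl(value, 0);
    #endif
    }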
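
Note on the pinned-memory hunks: hipMallocHost and hipFreeHost are the
deprecated spellings; hipHostMalloc and hipHostFree are the current HIP names
for page-locked host allocation, mirroring cudaMallocHost/cudaFreeHost. A
standalone sketch of the pair the allocator now calls (illustrative only, not
taken from this patch):

    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
      float* pinned = nullptr;
      // Allocate 256 floats of page-locked (pinned) host memory.
      if (hipHostMalloc(reinterpret_cast<void**>(&pinned), 256 * sizeof(float)) != hipSuccess) {
        std::fprintf(stderr, "hipHostMalloc failed\n");
        return 1;
      }
      pinned[0] = 1.0f;  // usable directly from the host; eligible for async DMA copies
      hipHostFree(pinned);
      return 0;
    }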