finish evaluate_splits.cu

amdsc21 2023-03-09 22:15:10 +01:00
parent 1e09c21456
commit f55243fda0
7 changed files with 73 additions and 8 deletions

View File

@@ -197,7 +197,7 @@ if (USE_HIP)
   find_package(hipcub REQUIRED)
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
-  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
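
Note: the -w added to the existing -Wunused-result flags silences all remaining hipcc (clang) warnings wholesale, and the -D__HIP_PLATFORM_AMD__ define already set here is what drives the compile-time dispatch in the source changes below. A minimal sketch of that dispatch pattern; the DeviceError alias is illustrative, not part of this tree:

#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_runtime.h>
using DeviceError = hipError_t;   // HIP build: AMD platform selected via the -D flag
#else
#include <cuda_runtime.h>
using DeviceError = cudaError_t;  // default: CUDA build
#endif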

View File

@@ -74,7 +74,7 @@ class pinned_allocator {
     pointer result(nullptr);
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #else
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #endif
@@ -84,7 +84,7 @@
   inline void deallocate(pointer p, size_type) {
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipFreeHost(p));
+    dh::safe_cuda(hipHostFree(p));
 #else
     dh::safe_cuda(cudaFreeHost(p));
 #endif
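
Note: hipMallocHost and hipFreeHost are HIP's deprecated spellings; these hunks move to the current hipHostMalloc/hipHostFree pair without touching the CUDA branch. A minimal standalone sketch of the same portable pinned-memory pattern, assuming the same XGBOOST_USE_HIP guard (the helper names are hypothetical):

#include <cstddef>
#if defined(XGBOOST_USE_HIP)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Allocate page-locked (pinned) host memory; returns nullptr on failure.
inline void* PinnedAlloc(std::size_t bytes) {
  void* p = nullptr;
#if defined(XGBOOST_USE_HIP)
  if (hipHostMalloc(&p, bytes) != hipSuccess) return nullptr;
#else
  if (cudaMallocHost(&p, bytes) != cudaSuccess) return nullptr;
#endif
  return p;
}

// Release memory obtained from PinnedAlloc.
inline void PinnedFree(void* p) {
#if defined(XGBOOST_USE_HIP)
  (void)hipHostFree(p);
#else
  (void)cudaFreeHost(p);
#endif
}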

View File

@@ -17,8 +17,10 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"
-#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined (__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
 namespace xgboost {

View File

@@ -6,12 +6,22 @@
 #include <limits>
 #include "../../common/categorical.h"
+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif
 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
 namespace xgboost {
+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif
 namespace tree {
// With constraints
@@ -99,8 +109,13 @@ class EvaluateSplitAgent {
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
     // Broadcast result from thread 0
+#if defined(XGBOOST_USE_CUDA)
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
+#elif defined(XGBOOST_USE_HIP)
+    return {__shfl(local_sum.GetQuantisedGrad(), 0),
+            __shfl(local_sum.GetQuantisedHess(), 0)};
+#endif
   }
   // Load using efficient 128 vector load instruction
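
Note on the branch above: since CUDA 9, warp shuffles on NVIDIA hardware must use the *_sync variants with an explicit member mask. The HIP path uses the unmasked __shfl, the portable choice on ROCm, where wavefronts are 64 lanes wide so a 32-bit full mask would not describe them anyway. A minimal sketch of the broadcast-from-lane-0 idiom, under the same guards (the function name is illustrative):

// Broadcast a value held by lane 0 to every lane of the warp/wavefront.
__device__ inline float BroadcastLane0(float v) {
#if defined(XGBOOST_USE_CUDA)
  return __shfl_sync(0xffffffff, v, 0);  // mask names all 32 CUDA lanes
#elif defined(XGBOOST_USE_HIP)
  return __shfl(v, 0);  // unmasked; spans the 64-lane AMD wavefront
#endif
}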
@ -124,10 +139,15 @@ class EvaluateSplitAgent {
evaluator, missing_left, rounding)
: kNullGain;
// Find thread with best gain
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
// This reduce result is only valid in thread 0
// broadcast to the rest of the warp
#if defined(XGBOOST_USE_CUDA)
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
#elif defined(XGBOOST_USE_HIP)
auto best_thread = __shfl(best.key, 0);
#endif
// Best thread updates the split
if (threadIdx.x == best_thread) {
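
Note on the (int)threadIdx.x cast in this and the two hunks below: the reduce's value type is cub::KeyValuePair<int, float> (which cub::ArgMax expects) while threadIdx.x is unsigned, so the braced initializer {threadIdx.x, gain} performs a narrowing conversion that nvcc merely warns about but hipcc (clang) rejects as an error; the explicit cast is presumably there to satisfy both compilers. A minimal sketch of the pair construction (function name illustrative):

#include <cub/cub.cuh>  // resolves to hipcub through the cub alias on HIP builds

// Build the ArgMax key/value pair with an explicit int key to avoid
// unsigned -> int narrowing inside the braced initializer.
__device__ cub::KeyValuePair<int, float> MakeArgMaxPair(float gain) {
  return {static_cast<int>(threadIdx.x), gain};
}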
@ -157,10 +177,15 @@ class EvaluateSplitAgent {
: kNullGain;
// Find thread with best gain
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
// This reduce result is only valid in thread 0
// broadcast to the rest of the warp
#if defined(XGBOOST_USE_CUDA)
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
#elif defined(XGBOOST_USE_HIP)
auto best_thread = __shfl(best.key, 0);
#endif
// Best thread updates the split
if (threadIdx.x == best_thread) {
int32_t split_gidx = (scan_begin + threadIdx.x);
@ -186,10 +211,15 @@ class EvaluateSplitAgent {
: kNullGain;
// Find thread with best gain
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
// This reduce result is only valid in thread 0
// broadcast to the rest of the warp
#if defined(XGBOOST_USE_CUDA)
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
#elif defined(XGBOOST_USE_HIP)
auto best_thread = __shfl(best.key, 0);
#endif
// Best thread updates the split
if (threadIdx.x == best_thread) {
assert(thread_active);
@@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   event.Record(dh::DefaultStream());
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
+#endif
   }
 }
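
Note: the HIP branch mirrors the CUDA calls one-for-one, and dh::safe_cuda checks the returned status code on either platform. A minimal standalone sketch of the event/stream ordering CopyToHost relies on, with illustrative function and parameter names:

#include <cstddef>
#if defined(XGBOOST_USE_HIP)
#include <hip/hip_runtime.h>
using Stream = hipStream_t;
using Event = hipEvent_t;
#else
#include <cuda_runtime.h>
using Stream = cudaStream_t;
using Event = cudaEvent_t;
#endif

// Record a point on the producing stream, make the copy stream wait on it,
// then issue the device-to-host copy asynchronously on the copy stream.
void CopyAfter(void* dst, const void* src, std::size_t bytes,
               Stream producer, Stream copier, Event ev) {
#if defined(XGBOOST_USE_HIP)
  hipEventRecord(ev, producer);
  hipStreamWaitEvent(copier, ev, 0);
  hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, copier);
#else
  cudaEventRecord(ev, producer);
  cudaStreamWaitEvent(copier, ev, 0);
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, copier);
#endif
}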
@@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
   this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                        dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                               hipMemcpyDeviceToHost));
+#endif
   return root_entry;
 }

View File

@@ -0,0 +1,4 @@
+#if defined(XGBOOST_USE_HIP)
+#include "evaluate_splits.cu"
+#endif
+
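
Note: this new file is a thin wrapper translation unit. The HIP build compiles .hip sources, so the wrapper simply re-includes the unmodified CUDA file and hipcc treats the whole thing as HIP C++, avoiding a duplicated copy of the kernels. The same trick in isolation, with hypothetical file names:

// my_kernels.hip: compiled only in the HIP build; pulls in the CUDA
// source so hipcc compiles it as HIP C++.
#if defined(XGBOOST_USE_HIP)
#include "my_kernels.cu"
#endif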

View File

@@ -121,8 +121,10 @@ class TreeEvaluator {
   // Fast floating point division instruction on device
   XGBOOST_DEVICE float Divide(float a, float b) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDA_ARCH__)
     return __fdividef(a, b);
+#elif defined(__HIP_PLATFORM_AMD__)
+    return a / b;
 #else
     return a / b;
 #endif
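
Note: __fdividef is CUDA's reduced-accuracy fast-division intrinsic, and this change confines it to NVIDIA device code, routing the AMD device path to ordinary division instead. A sketch of the resulting dispatch, assuming compilation by nvcc or hipcc (which supply the __host__/__device__ qualifiers):

#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_runtime.h>
#endif

// Fast approximate division only where the CUDA intrinsic exists;
// plain IEEE division on the AMD device path and on the host.
__host__ __device__ inline float FastDivide(float a, float b) {
#if defined(__CUDA_ARCH__)
  return __fdividef(a, b);
#else
  return a / b;
#endif
}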

View File

@@ -4,12 +4,26 @@
 #pragma once
 #include <thrust/random.h>
 #include <cstdio>
 #include <limits>
+#include <float.h>
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif
 #include <stdexcept>
 #include <string>
 #include <vector>
 #include "../common/categorical.h"
+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
 #include "../common/random.h"
 #include "gpu_hist/histogram.cuh"
 #include "param.h"