finish evaluate_splits.cu
This commit is contained in:
parent
1e09c21456
commit
f55243fda0
@ -197,7 +197,7 @@ if (USE_HIP)
|
||||
find_package(hipcub REQUIRED)
|
||||
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
|
||||
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
|
||||
add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
|
||||
|
||||
|
||||
@ -74,7 +74,7 @@ class pinned_allocator {
|
||||
pointer result(nullptr);
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
|
||||
dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
|
||||
#else
|
||||
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
|
||||
#endif
|
||||
@ -84,7 +84,7 @@ class pinned_allocator {
|
||||
|
||||
inline void deallocate(pointer p, size_type) {
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipFreeHost(p));
|
||||
dh::safe_cuda(hipHostFree(p));
|
||||
#else
|
||||
dh::safe_cuda(cudaFreeHost(p));
|
||||
#endif
|
||||
|
||||
@ -17,8 +17,10 @@
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined (__CUDACC__)
|
||||
#include "device_helpers.cuh"
|
||||
#elif defined(__HIP_PLATFORM_AMD__)
|
||||
#include "device_helpers.hip.h"
|
||||
#endif // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@ -6,12 +6,22 @@
|
||||
#include <limits>
|
||||
|
||||
#include "../../common/categorical.h"
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#include "../../common/device_helpers.cuh"
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
#include "../../common/device_helpers.hip.h"
|
||||
#endif
|
||||
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
#include "evaluate_splits.cuh"
|
||||
#include "expand_entry.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
namespace cub = hipcub;
|
||||
#endif
|
||||
|
||||
namespace tree {
|
||||
|
||||
// With constraints
|
||||
@ -99,8 +109,13 @@ class EvaluateSplitAgent {
|
||||
}
|
||||
local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
|
||||
// Broadcast result from thread 0
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
|
||||
__shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
return {__shfl(local_sum.GetQuantisedGrad(), 0),
|
||||
__shfl(local_sum.GetQuantisedHess(), 0)};
|
||||
#endif
|
||||
}
|
||||
|
||||
// Load using efficient 128-bit vector load instruction
|
||||
@ -124,10 +139,15 @@ class EvaluateSplitAgent {
|
||||
evaluator, missing_left, rounding)
|
||||
: kNullGain;
|
||||
// Find thread with best gain
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
|
||||
|
||||
// This reduce result is only valid in thread 0
|
||||
// broadcast to the rest of the warp
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
auto best_thread = __shfl(best.key, 0);
|
||||
#endif
|
||||
|
||||
// Best thread updates the split
|
||||
if (threadIdx.x == best_thread) {
|
||||
@ -157,10 +177,15 @@ class EvaluateSplitAgent {
|
||||
: kNullGain;
|
||||
|
||||
// Find thread with best gain
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
|
||||
// This reduce result is only valid in thread 0
|
||||
// broadcast to the rest of the warp
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
auto best_thread = __shfl(best.key, 0);
|
||||
#endif
|
||||
|
||||
// Best thread updates the split
|
||||
if (threadIdx.x == best_thread) {
|
||||
int32_t split_gidx = (scan_begin + threadIdx.x);
|
||||
@ -186,10 +211,15 @@ class EvaluateSplitAgent {
|
||||
: kNullGain;
|
||||
|
||||
// Find thread with best gain
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
|
||||
auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
|
||||
// This reduce result is only valid in thread 0
|
||||
// broadcast to the rest of the warp
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
auto best_thread = __shfl(best.key, 0);
|
||||
#endif
|
||||
|
||||
// Best thread updates the split
|
||||
if (threadIdx.x == best_thread) {
|
||||
assert(thread_active);
|
||||
@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
|
||||
event.Record(dh::DefaultStream());
|
||||
for (auto idx : nidx) {
|
||||
copy_stream_.View().Wait(event);
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
|
||||
d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(
|
||||
h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
|
||||
d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
|
||||
this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
|
||||
dh::ToSpan(out_entries));
|
||||
GPUExpandEntry root_entry;
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
|
||||
cudaMemcpyDeviceToHost));
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
|
||||
hipMemcpyDeviceToHost));
|
||||
#endif
|
||||
return root_entry;
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
|
||||
#if defined(XGBOOST_USE_HIP)
|
||||
#include "evaluate_splits.cu"
|
||||
#endif
|
||||
@ -121,8 +121,10 @@ class TreeEvaluator {
|
||||
|
||||
// Fast floating point division instruction on device
|
||||
XGBOOST_DEVICE float Divide(float a, float b) const {
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__CUDA_ARCH__)
|
||||
return __fdividef(a, b);
|
||||
#elif defined(__HIP_PLATFORM_AMD__)
|
||||
return a / b;
|
||||
#else
|
||||
return a / b;
|
||||
#endif
|
||||
|
||||
@ -4,12 +4,26 @@
|
||||
#pragma once
|
||||
#include <thrust/random.h>
|
||||
#include <cstdio>
|
||||
#include <limits>
|
||||
#include <float.h>
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#include <cub/cub.cuh>
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
#include <hipcub/hipcub.hpp>
|
||||
#endif
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "../common/categorical.h"
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
#include "../common/device_helpers.cuh"
|
||||
#elif defined(XGBOOST_USE_HIP)
|
||||
#include "../common/device_helpers.hip.h"
|
||||
#endif
|
||||
|
||||
#include "../common/random.h"
|
||||
#include "gpu_hist/histogram.cuh"
|
||||
#include "param.h"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user