finish evaluate_splits.cu
parent 1e09c21456
commit f55243fda0
@@ -197,7 +197,7 @@ if (USE_HIP)
   find_package(hipcub REQUIRED)

   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -I${HIP_INCLUDE_DIRS} -I${HIP_INCLUDE_DIRS}/hip")
-  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result")
+  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -D__HIP_PLATFORM_AMD__")
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
@@ -74,7 +74,7 @@ class pinned_allocator {
     pointer result(nullptr);

 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
+    dh::safe_cuda(hipHostMalloc(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #else
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
 #endif
@@ -84,7 +84,7 @@ class pinned_allocator {

   inline void deallocate(pointer p, size_type) {
 #if defined(XGBOOST_USE_HIP)
-    dh::safe_cuda(hipFreeHost(p));
+    dh::safe_cuda(hipHostFree(p));
 #else
     dh::safe_cuda(cudaFreeHost(p));
 #endif
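Aside, not part of the diff: hipMallocHost and hipFreeHost have been deprecated in HIP in favour of hipHostMalloc/hipHostFree, which is what the two hunks above switch to. A minimal standalone sketch of the same pinned (page-locked) host allocation pattern, using only the public CUDA/HIP runtime APIs and the commit's XGBOOST_USE_HIP switch; the file and function names are invented for illustration, not XGBoost's:

    // pinned_alloc_sketch.cc -- illustrative sketch, not part of this commit.
    #include <cstddef>

    #if defined(XGBOOST_USE_HIP)
    #include <hip/hip_runtime.h>
    #else
    #include <cuda_runtime.h>
    #endif

    // Allocate n floats of page-locked (pinned) host memory; nullptr on failure.
    float* AllocPinned(std::size_t n) {
      float* p = nullptr;
    #if defined(XGBOOST_USE_HIP)
      if (hipHostMalloc(reinterpret_cast<void**>(&p), n * sizeof(float),
                        hipHostMallocDefault) != hipSuccess) {
        return nullptr;
      }
    #else
      if (cudaMallocHost(reinterpret_cast<void**>(&p), n * sizeof(float)) != cudaSuccess) {
        return nullptr;
      }
    #endif
      return p;
    }

    // Release pinned memory with the matching backend call.
    void FreePinned(float* p) {
    #if defined(XGBOOST_USE_HIP)
      (void)hipHostFree(p);
    #else
      (void)cudaFreeHost(p);
    #endif
    }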
@@ -17,8 +17,10 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"

-#if defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
+#if defined (__CUDACC__)
 #include "device_helpers.cuh"
+#elif defined(__HIP_PLATFORM_AMD__)
+#include "device_helpers.hip.h"
 #endif  // defined (__CUDACC__) || defined(__HIP_PLATFORM_AMD__)

 namespace xgboost {
@@ -6,12 +6,22 @@
 #include <limits>

 #include "../../common/categorical.h"

+#if defined(XGBOOST_USE_CUDA)
 #include "../../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../../common/device_helpers.hip.h"
+#endif

 #include "../../data/ellpack_page.cuh"
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"

 namespace xgboost {
+#if defined(XGBOOST_USE_HIP)
+namespace cub = hipcub;
+#endif

 namespace tree {

 // With constraints
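Aside, not part of the diff: the "namespace cub = hipcub;" alias added above is what lets the rest of this file keep using cub:: qualifiers (cub::ArgMax and friends) unchanged on ROCm builds, since hipCUB mirrors CUB's names. The include pattern, sketched on its own under the same macros the commit uses:

    // cub_alias_sketch.h -- illustrative sketch, not part of this commit.
    #if defined(XGBOOST_USE_CUDA)
    #include <cub/cub.cuh>         // NVIDIA CUB
    #elif defined(XGBOOST_USE_HIP)
    #include <hipcub/hipcub.hpp>   // ROCm hipCUB exposes the same names under hipcub::
    namespace cub = hipcub;        // existing cub:: call sites compile unchanged
    #endif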
@@ -99,8 +109,13 @@ class EvaluateSplitAgent {
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
     // Broadcast result from thread 0
+#if defined(XGBOOST_USE_CUDA)
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
+#elif defined(XGBOOST_USE_HIP)
+    return {__shfl(local_sum.GetQuantisedGrad(), 0),
+            __shfl(local_sum.GetQuantisedHess(), 0)};
+#endif
   }

   // Load using efficient 128 vector load instruction
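Aside, not part of the diff: __shfl_sync with an explicit lane mask is a CUDA-specific warp primitive; the hunks in this file fall back to HIP's maskless __shfl for the same lane-0 broadcast (a 32-bit full mask also would not map cleanly onto AMD's typically 64-lane wavefronts). A self-contained sketch of that broadcast helper, assuming the commit's macros; the name BroadcastLane0 is invented for illustration:

    // warp_broadcast_sketch.cuh -- illustrative sketch, not part of this commit.
    // Broadcast the value held by lane 0 to every thread in the warp/wavefront.
    __device__ inline float BroadcastLane0(float v) {
    #if defined(XGBOOST_USE_CUDA)
      return __shfl_sync(0xffffffff, v, 0);  // CUDA: explicit full-warp mask
    #elif defined(XGBOOST_USE_HIP)
      return __shfl(v, 0);                   // HIP: maskless shuffle
    #else
      return v;                              // other builds: nothing to broadcast
    #endif
    }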
@@ -124,10 +139,15 @@ class EvaluateSplitAgent {
                                            evaluator, missing_left, rounding)
                          : kNullGain;
       // Find thread with best gain
-      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());

       // This reduce result is only valid in thread 0
       // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
       auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+      auto best_thread = __shfl(best.key, 0);
+#endif

       // Best thread updates the split
       if (threadIdx.x == best_thread) {
@@ -157,10 +177,15 @@ class EvaluateSplitAgent {
                          : kNullGain;

       // Find thread with best gain
-      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
       // This reduce result is only valid in thread 0
       // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
       auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+      auto best_thread = __shfl(best.key, 0);
+#endif

       // Best thread updates the split
       if (threadIdx.x == best_thread) {
         int32_t split_gidx = (scan_begin + threadIdx.x);
@@ -186,10 +211,15 @@ class EvaluateSplitAgent {
                          : kNullGain;

       // Find thread with best gain
-      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({(int)threadIdx.x, gain}, cub::ArgMax());
       // This reduce result is only valid in thread 0
       // broadcast to the rest of the warp
+#if defined(XGBOOST_USE_CUDA)
       auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+#elif defined(XGBOOST_USE_HIP)
+      auto best_thread = __shfl(best.key, 0);
+#endif

       // Best thread updates the split
       if (threadIdx.x == best_thread) {
         assert(thread_active);
@@ -391,9 +421,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   event.Record(dh::DefaultStream());
   for (auto idx : nidx) {
     copy_stream_.View().Wait(event);

+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpyAsync(
         h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
         d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View()));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpyAsync(
+        h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(),
+        d_cats.GetNodeCatStorage(idx).size_bytes(), hipMemcpyDeviceToHost, copy_stream_.View()));
+#endif
   }
 }
@@ -456,8 +493,14 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
   this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                        dh::ToSpan(out_entries));
   GPUExpandEntry root_entry;

+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
                                 cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),
+                               hipMemcpyDeviceToHost));
+#endif
   return root_entry;
 }
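Aside, not part of the diff: the two memcpy hunks above are the same one-for-one substitution, since hipMemcpyAsync mirrors cudaMemcpyAsync argument-for-argument and only the direction enum changes name. A hedged sketch of a portable device-to-host copy helper; the helper name and error handling are invented for illustration, and the stream type is the raw runtime handle rather than the dh:: wrapper the real code passes:

    // d2h_copy_sketch.h -- illustrative sketch, not part of this commit.
    #include <cstddef>

    #if defined(XGBOOST_USE_HIP)
    #include <hip/hip_runtime.h>
    using StreamT = hipStream_t;
    #else
    #include <cuda_runtime.h>
    using StreamT = cudaStream_t;
    #endif

    // Enqueue an asynchronous device-to-host copy of the given size on the given stream.
    inline bool CopyDeviceToHostAsync(void* dst, const void* src, std::size_t bytes,
                                      StreamT stream) {
    #if defined(XGBOOST_USE_HIP)
      return hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, stream) == hipSuccess;
    #else
      return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream) == cudaSuccess;
    #endif
    }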
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "evaluate_splits.cu"
+#endif
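Aside, not part of the diff: the new four-line file is a thin wrapper so the HIP toolchain can compile the existing evaluate_splits.cu as a HIP translation unit, while a CUDA build sees an empty file. The same pattern applied to a hypothetical second source file (file names invented for illustration):

    // some_kernel.hip -- hypothetical wrapper following the same pattern.
    #if defined(XGBOOST_USE_HIP)
    #include "some_kernel.cu"  // reuse the CUDA source unchanged under the HIP compiler
    #endif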
@@ -121,8 +121,10 @@ class TreeEvaluator {

   // Fast floating point division instruction on device
   XGBOOST_DEVICE float Divide(float a, float b) const {
-#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__)
+#if defined(__CUDA_ARCH__)
     return __fdividef(a, b);
+#elif defined(__HIP_PLATFORM_AMD__)
+    return a / b;
 #else
     return a / b;
 #endif
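Aside, not part of the diff: __fdividef is CUDA's fast, reduced-precision single-precision division intrinsic, valid only in device code (hence the __CUDA_ARCH__ guard); the hunk above routes HIP device code to ordinary division rather than assuming an equivalent intrinsic exists. A self-contained sketch of the resulting three-way dispatch; SKETCH_DEVICE stands in for XGBoost's XGBOOST_DEVICE macro, which normally expands to __host__ __device__ under a GPU compiler:

    // fast_divide_sketch.h -- illustrative sketch, not part of this commit.
    #if defined(__CUDACC__) || defined(__HIPCC__)
    #define SKETCH_DEVICE __host__ __device__
    #else
    #define SKETCH_DEVICE
    #endif

    SKETCH_DEVICE inline float Divide(float a, float b) {
    #if defined(__CUDA_ARCH__)
      return __fdividef(a, b);  // fast, less precise CUDA device intrinsic
    #elif defined(__HIP_PLATFORM_AMD__)
      return a / b;             // HIP device code: plain division
    #else
      return a / b;             // host code
    #endif
    }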
@@ -4,12 +4,26 @@
 #pragma once
 #include <thrust/random.h>
 #include <cstdio>
+#include <limits>
+#include <float.h>

+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
+#endif

 #include <stdexcept>
 #include <string>
 #include <vector>
 #include "../common/categorical.h"

+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif

 #include "../common/random.h"
 #include "gpu_hist/histogram.cuh"
 #include "param.h"