finish gpu_predictor.cu

parent 1c58ff61d1
commit f0febfbcac

 .gitmodules (vendored) | 3 +++
@@ -11,3 +11,6 @@
 [submodule "rocgputreeshap"]
 	path = rocgputreeshap
 	url = https://www.github.com/AMD-AI/rocgputreeshap
+[submodule "warp-primitives"]
+	path = warp-primitives
+	url = https://github.com/AMD-AI/warp-primitives
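Note: after pulling this commit, the new submodule has to be fetched (git submodule update --init warp-primitives) before the build changes below can find its headers.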
@@ -194,9 +194,11 @@ function(xgboost_set_hip_flags target)
   if (NOT BUILD_WITH_HIP_CUB)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1)
     target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)
   else ()
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1)
     target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap)
+    target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include)
   endif (NOT BUILD_WITH_HIP_CUB)

   set_target_properties(${target} PROPERTIES
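Note: both branches now add warp-primitives/include to the target's include path, which is what lets the new #include <amd_warp_primitives.h> in gpu_predictor.cu (below) resolve; the branches otherwise differ only in whether THRUST_IGNORE_CUB_VERSION_CHECK is defined alongside XGBOOST_USE_HIP.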
@@ -9,7 +9,12 @@
 #include <memory>
 #include <string>

+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
+
 #include "../common/math.h"
 #include "adapter.h"
 #include "array_interface.h"
@@ -114,7 +119,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {

 #if defined(XGBOOST_USE_HIP)
     dh::safe_cuda(hipSetDevice(device_idx_));
-#else
+#elif defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(device_idx_));
 #endif

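Note: rewriting #else as #elif defined(XGBOOST_USE_CUDA) makes both backends explicit, so a build with neither macro defined no longer falls through to the CUDA call. The same guard recurs at every device-touching call site in this commit; a minimal sketch of the repeated shape, using a hypothetical helper name that is not part of this diff:

    // Sketch only; SetDeviceOrDie is a hypothetical name, not in this diff.
    // dh::safe_cuda is XGBoost's error-checking wrapper; in HIP builds it
    // checks hipError_t results instead of cudaError_t.
    inline void SetDeviceOrDie(int device_idx) {
    #if defined(XGBOOST_USE_HIP)
      dh::safe_cuda(hipSetDevice(device_idx));
    #elif defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaSetDevice(device_idx));
    #endif
    }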
@@ -204,7 +209,7 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,

 #if defined(XGBOOST_USE_HIP)
   dh::safe_cuda(hipSetDevice(device_idx));
-#else
+#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device_idx));
 #endif

@@ -222,10 +227,11 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
   dh::XGBCachingDeviceAllocator<char> alloc;

 #if defined(XGBOOST_USE_HIP)
+  size_t row_stride =
       dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
                  thrust::device_pointer_cast(offset.data()) + offset.size(),
                  static_cast<std::size_t>(0), thrust::maximum<size_t>());
-#else
+#elif defined(XGBOOST_USE_CUDA)
   size_t row_stride =
       dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
                  thrust::device_pointer_cast(offset.data()) + offset.size(),
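Note: the added size_t row_stride = is the point of this hunk; before it, the HIP branch ran the reduction without declaring row_stride, so HIP builds could not have compiled this function. Both branches compute the same value, the maximum entry count over offset. A sketch of how the two branches could collapse into one call, assuming dh::Reduce accepts either Thrust policy object:

    // Sketch, not part of this diff: select the execution policy once.
    #if defined(XGBOOST_USE_HIP)
      auto policy = thrust::hip::par(alloc);
    #elif defined(XGBOOST_USE_CUDA)
      auto policy = thrust::cuda::par(alloc);
    #endif
      // Maximum number of valid entries in any row of the batch.
      size_t row_stride = dh::Reduce(
          policy, thrust::device_pointer_cast(offset.data()),
          thrust::device_pointer_cast(offset.data()) + offset.size(),
          static_cast<std::size_t>(0), thrust::maximum<size_t>());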
@@ -1,6 +1,7 @@
 /*!
  * Copyright 2017-2021 by Contributors
  */
+#include <amd_warp_primitives.h>
 #include <GPUTreeShap/gpu_treeshap.h>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
@@ -13,7 +14,13 @@
 #include "../common/bitfield.h"
 #include "../common/categorical.h"
 #include "../common/common.h"
+
+#if defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#elif defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#endif
+
 #include "../data/device_adapter.cuh"
 #include "../data/ellpack_page.cuh"
 #include "../data/proxy_dmatrix.h"
@@ -342,7 +349,11 @@ class DeviceModel {
   int num_group;

   void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipSetDevice(gpu_id));
+#endif

     CHECK_EQ(model.param.size_leaf_vector, 0);
     // Copy decision trees to device
@@ -365,12 +376,22 @@ class DeviceModel {
     for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
       auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
       auto& src_stats = model.trees.at(tree_idx)->GetStats();
+
+#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaMemcpyAsync(
           d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
           sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault));
       dh::safe_cuda(cudaMemcpyAsync(
           d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
           sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
+#elif defined(XGBOOST_USE_HIP)
+      dh::safe_cuda(hipMemcpyAsync(
+          d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(),
+          sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault));
+      dh::safe_cuda(hipMemcpyAsync(
+          d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(),
+          sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault));
+#endif
     }

     tree_group = std::move(HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id));
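Note: the HIP branch mirrors the CUDA copies one-for-one; hipMemcpyAsync and hipMemcpyDefault are the direct HIP counterparts, with the runtime inferring the copy direction from the pointers. A hypothetical wrapper, not part of this diff, that would keep the per-tree loop free of preprocessor blocks:

    // Sketch only; CopyToDeviceAsync is a hypothetical helper name.
    template <typename T>
    void CopyToDeviceAsync(T* dst, T const* src, std::size_t n) {
    #if defined(XGBOOST_USE_HIP)
      dh::safe_cuda(hipMemcpyAsync(dst, src, sizeof(T) * n, hipMemcpyDefault));
    #elif defined(XGBOOST_USE_CUDA)
      dh::safe_cuda(cudaMemcpyAsync(dst, src, sizeof(T) * n, cudaMemcpyDefault));
    #endif
    }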
@@ -490,7 +511,11 @@ void ExtractPaths(
     dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
     DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
     int gpu_id) {
+#if defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(gpu_id));
+#endif
   auto& device_model = *model;

   dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -513,6 +538,8 @@ void ExtractPaths(
         }
         return PathInfo{static_cast<int64_t>(idx), path_length, tree_idx};
       });
+
+#if defined(XGBOOST_USE_CUDA)
   auto end = thrust::copy_if(
       thrust::cuda::par(alloc), nodes_transform,
       nodes_transform + d_nodes.size(), info.begin(),
@@ -525,6 +552,20 @@ void ExtractPaths(
   thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator,
                          length_iterator + info.size() + 1,
                          path_segments.begin());
+#elif defined(XGBOOST_USE_HIP)
+  auto end = thrust::copy_if(
+      thrust::hip::par(alloc), nodes_transform,
+      nodes_transform + d_nodes.size(), info.begin(),
+      [=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
+  info.resize(end - info.begin());
+  auto length_iterator = dh::MakeTransformIterator<size_t>(
+      info.begin(),
+      [=] __device__(const PathInfo& info) { return info.length; });
+  dh::caching_device_vector<size_t> path_segments(info.size() + 1);
+  thrust::exclusive_scan(thrust::hip::par(alloc), length_iterator,
+                         length_iterator + info.size() + 1,
+                         path_segments.begin());
+#endif

   paths->resize(path_segments.back());

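Note: the HIP branch duplicates the whole CUDA sequence with thrust::hip::par substituted for thrust::cuda::par. A sketch of keeping a single sequence, on the assumption that aliasing the Thrust backend namespace is acceptable to both toolchains:

    // Sketch, not part of this diff: alias the backend namespace once,
    // then write the copy_if/exclusive_scan pipeline a single time.
    #if defined(XGBOOST_USE_HIP)
    namespace exec = thrust::hip;
    #elif defined(XGBOOST_USE_CUDA)
    namespace exec = thrust::cuda;
    #endif
      // Keep only leaf-terminated paths, then prefix-sum their lengths.
      auto end = thrust::copy_if(
          exec::par(alloc), nodes_transform,
          nodes_transform + d_nodes.size(), info.begin(),
          [=] __device__(const PathInfo& e) { return e.leaf_position != -1; });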
@@ -550,9 +591,15 @@ void ExtractPaths(
         thrust::max_element(thrust::device, max_elem_it,
                             max_elem_it + d_cat_node_segments.size()) -
         max_elem_it;
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaMemcpy(h_max_cat.data(),
                              d_cat_node_segments.data() + max_cat_it,
                              h_max_cat.size_bytes(), cudaMemcpyDeviceToHost));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipMemcpy(h_max_cat.data(),
+                            d_cat_node_segments.data() + max_cat_it,
+                            h_max_cat.size_bytes(), hipMemcpyDeviceToHost));
+#endif
     max_cat = h_max_cat[0].size;
     CHECK_GE(max_cat, 1);
     path_categories->resize(max_cat * paths->size());
@@ -727,7 +774,11 @@ class GPUPredictor : public xgboost::Predictor {

   ~GPUPredictor() override {
     if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
+#if defined(XGBOOST_USE_CUDA)
       dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+      dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
+#endif
     }
   }

@@ -823,7 +874,13 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
+
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
+#endif
+
     out_contribs->SetDevice(ctx_->gpu_id);
     if (tree_end == 0 || tree_end > model.trees.size()) {
       tree_end = static_cast<uint32_t>(model.trees.size());
@@ -881,7 +938,13 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
+
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
+#endif
+
     out_contribs->SetDevice(ctx_->gpu_id);
     if (tree_end == 0 || tree_end > model.trees.size()) {
       tree_end = static_cast<uint32_t>(model.trees.size());
@@ -940,7 +1003,11 @@ class GPUPredictor : public xgboost::Predictor {
   void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
                    const gbm::GBTreeModel &model,
                    unsigned tree_end) const override {
+#if defined(XGBOOST_USE_CUDA)
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+#elif defined(XGBOOST_USE_HIP)
+    dh::safe_cuda(hipSetDevice(ctx_->gpu_id));
+#endif
     auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);

     const MetaInfo& info = p_fmat->Info();
(new file)
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "gpu_predictor.cu"
+#endif
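Note: this new four-line file is the HIP build's way of compiling the predictor: under XGBOOST_USE_HIP it textually includes gpu_predictor.cu, so the HIP toolchain builds the existing translation unit (with the #elif branches above selecting the hip* calls) instead of a forked copy; in a CUDA build the file compiles to an empty unit.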
@@ -67,9 +67,9 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
 namespace xgboost {
 namespace predictor {
 // List of files that will be force linked in static links.
-#ifdef XGBOOST_USE_CUDA
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 DMLC_REGISTRY_LINK_TAG(gpu_predictor);
-#endif  // XGBOOST_USE_CUDA
+#endif  // XGBOOST_USE_CUDA || defined(XGBOOST_USE_HIP)
 DMLC_REGISTRY_LINK_TAG(cpu_predictor);
 }  // namespace predictor
 }  // namespace xgboost
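Note: DMLC_REGISTRY_LINK_TAG pairs with a DMLC_REGISTRY_FILE_TAG in the tagged translation unit; referencing the tag from the file in this hunk keeps static linkers from discarding the object whose initializers register the GPU predictor. The dmlc-core convention, with the FILE_TAG line assumed to live in gpu_predictor.cu rather than shown in this diff:

    // In the GPU predictor translation unit (assumed, per dmlc-core usage):
    DMLC_REGISTRY_FILE_TAG(gpu_predictor);

    // In the registry file (this hunk): force the tagged unit to be linked in.
    DMLC_REGISTRY_LINK_TAG(gpu_predictor);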
 warp-primitives (new submodule) | 1 +

@@ -0,0 +1 @@
+Subproject commit d8d1bb6fff784e3c30f42d22d1fe09ca18c4c2e7