diff --git a/.gitmodules b/.gitmodules index aeff9610b..291bb25b8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,6 @@ [submodule "rocgputreeshap"] path = rocgputreeshap url = https://www.github.com/AMD-AI/rocgputreeshap +[submodule "warp-primitives"] + path = warp-primitives + url = https://github.com/AMD-AI/warp-primitives diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 31e8c16db..eb5756245 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -194,9 +194,11 @@ function(xgboost_set_hip_flags target) if (NOT BUILD_WITH_HIP_CUB) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1 -DTHRUST_IGNORE_CUB_VERSION_CHECK=1) target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap) + target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include) else () target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_HIP=1) target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/rocgputreeshap) + target_include_directories(${target} PRIVATE ${xgboost_SOURCE_DIR}/warp-primitives/include) endif (NOT BUILD_WITH_HIP_CUB) set_target_properties(${target} PROPERTIES diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 78d5f79b5..5eeb5fd5c 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -9,7 +9,12 @@ #include #include +#if defined(XGBOOST_USE_CUDA) #include "../common/device_helpers.cuh" +#elif defined(XGBOOST_USE_HIP) +#include "../common/device_helpers.hip.h" +#endif + #include "../common/math.h" #include "adapter.h" #include "array_interface.h" @@ -114,7 +119,7 @@ class CudfAdapter : public detail::SingleBatchDataIter { #if defined(XGBOOST_USE_HIP) dh::safe_cuda(hipSetDevice(device_idx_)); -#else +#elif defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(device_idx_)); #endif @@ -204,7 +209,7 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, #if defined(XGBOOST_USE_HIP) dh::safe_cuda(hipSetDevice(device_idx)); -#else +#elif defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(device_idx)); #endif @@ -222,10 +227,11 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, dh::XGBCachingDeviceAllocator alloc; #if defined(XGBOOST_USE_HIP) + size_t row_stride = dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data()) + offset.size(), static_cast(0), thrust::maximum()); -#else +#elif defined(XGBOOST_USE_CUDA) size_t row_stride = dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data()) + offset.size(), diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 35daf701c..2a67fd60e 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -1,6 +1,7 @@ /*! * Copyright 2017-2021 by Contributors */ +#include #include #include #include @@ -13,7 +14,13 @@ #include "../common/bitfield.h" #include "../common/categorical.h" #include "../common/common.h" + +#if defined(XGBOOST_USE_CUDA) #include "../common/device_helpers.cuh" +#elif defined(XGBOOST_USE_HIP) +#include "../common/device_helpers.hip.h" +#endif + #include "../data/device_adapter.cuh" #include "../data/ellpack_page.cuh" #include "../data/proxy_dmatrix.h" @@ -342,7 +349,11 @@ class DeviceModel { int num_group; void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(gpu_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(gpu_id)); +#endif CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device @@ -365,12 +376,22 @@ class DeviceModel { for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { auto& src_nodes = model.trees.at(tree_idx)->GetNodes(); auto& src_stats = model.trees.at(tree_idx)->GetStats(); + +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaMemcpyAsync( d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(), sizeof(RegTree::Node) * src_nodes.size(), cudaMemcpyDefault)); dh::safe_cuda(cudaMemcpyAsync( d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(), sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipMemcpyAsync( + d_nodes + h_tree_segments[tree_idx - tree_begin], src_nodes.data(), + sizeof(RegTree::Node) * src_nodes.size(), hipMemcpyDefault)); + dh::safe_cuda(hipMemcpyAsync( + d_stats + h_tree_segments[tree_idx - tree_begin], src_stats.data(), + sizeof(RTreeNodeStat) * src_stats.size(), hipMemcpyDefault)); +#endif } tree_group = std::move(HostDeviceVector(model.tree_info.size(), 0, gpu_id)); @@ -490,7 +511,11 @@ void ExtractPaths( dh::device_vector> *paths, DeviceModel *model, dh::device_vector *path_categories, int gpu_id) { +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(gpu_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(gpu_id)); +#endif auto& device_model = *model; dh::caching_device_vector info(device_model.nodes.Size()); @@ -513,6 +538,8 @@ void ExtractPaths( } return PathInfo{static_cast(idx), path_length, tree_idx}; }); + +#if defined(XGBOOST_USE_CUDA) auto end = thrust::copy_if( thrust::cuda::par(alloc), nodes_transform, nodes_transform + d_nodes.size(), info.begin(), @@ -525,6 +552,20 @@ void ExtractPaths( thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator, length_iterator + info.size() + 1, path_segments.begin()); +#elif defined(XGBOOST_USE_HIP) + auto end = thrust::copy_if( + thrust::hip::par(alloc), nodes_transform, + nodes_transform + d_nodes.size(), info.begin(), + [=] __device__(const PathInfo& e) { return e.leaf_position != -1; }); + info.resize(end - info.begin()); + auto length_iterator = dh::MakeTransformIterator( + info.begin(), + [=] __device__(const PathInfo& info) { return info.length; }); + dh::caching_device_vector path_segments(info.size() + 1); + thrust::exclusive_scan(thrust::hip::par(alloc), length_iterator, + length_iterator + info.size() + 1, + path_segments.begin()); +#endif paths->resize(path_segments.back()); @@ -550,9 +591,15 @@ void ExtractPaths( thrust::max_element(thrust::device, max_elem_it, max_elem_it + d_cat_node_segments.size()) - max_elem_it; +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaMemcpy(h_max_cat.data(), d_cat_node_segments.data() + max_cat_it, h_max_cat.size_bytes(), cudaMemcpyDeviceToHost)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipMemcpy(h_max_cat.data(), + d_cat_node_segments.data() + max_cat_it, + h_max_cat.size_bytes(), hipMemcpyDeviceToHost)); +#endif max_cat = h_max_cat[0].size; CHECK_GE(max_cat, 1); path_categories->resize(max_cat * paths->size()); @@ -727,7 +774,11 @@ class GPUPredictor : public xgboost::Predictor { ~GPUPredictor() override { if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(ctx_->gpu_id)); +#endif } } @@ -823,7 +874,13 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } + +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(ctx_->gpu_id)); +#endif + out_contribs->SetDevice(ctx_->gpu_id); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); @@ -881,7 +938,13 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } + +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(ctx_->gpu_id)); +#endif + out_contribs->SetDevice(ctx_->gpu_id); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); @@ -940,7 +1003,11 @@ class GPUPredictor : public xgboost::Predictor { void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *predictions, const gbm::GBTreeModel &model, unsigned tree_end) const override { +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(ctx_->gpu_id)); +#endif auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); const MetaInfo& info = p_fmat->Info(); diff --git a/src/predictor/gpu_predictor.hip b/src/predictor/gpu_predictor.hip index e69de29bb..33760f6dd 100644 --- a/src/predictor/gpu_predictor.hip +++ b/src/predictor/gpu_predictor.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "gpu_predictor.cu" +#endif diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index c6ef7fe51..d1918d221 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -67,9 +67,9 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector