From 41407850d5e3ab173a5ea30b343f17c3e1161b53 Mon Sep 17 00:00:00 2001 From: amdsc21 <96135754+amdsc21@users.noreply.github.com> Date: Fri, 10 Mar 2023 06:29:08 +0100 Subject: [PATCH] finish rank_obj.cu --- src/objective/rank_obj.cc | 4 +- src/objective/rank_obj.cu | 87 ++++++++++++++++++++++++++++++++------ src/objective/rank_obj.hip | 4 ++ 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc index 25cd9e643..61b53a976 100644 --- a/src/objective/rank_obj.cc +++ b/src/objective/rank_obj.cc @@ -12,6 +12,6 @@ DMLC_REGISTRY_FILE_TAG(rank_obj); } // namespace obj } // namespace xgboost -#ifndef XGBOOST_USE_CUDA +#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) #include "rank_obj.cu" -#endif // XGBOOST_USE_CUDA +#endif // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) diff --git a/src/objective/rank_obj.cu b/src/objective/rank_obj.cu index f1c870210..805870aac 100644 --- a/src/objective/rank_obj.cu +++ b/src/objective/rank_obj.cu @@ -25,12 +25,23 @@ #include #include "../common/device_helpers.cuh" +#elif defined(__HIP_PLATFORM_AMD__) + +#include +#include +#include +#include +#include + +#include + +#include "../common/device_helpers.hip.h" #endif namespace xgboost { namespace obj { -#if defined(XGBOOST_USE_CUDA) && !defined(GTEST_TEST) +#if (defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)) && !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(rank_obj_gpu); #endif // defined(XGBOOST_USE_CUDA) @@ -47,7 +58,7 @@ struct LambdaRankParam : public XGBoostParameter { } }; -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) // Helper functions template @@ -118,7 +129,7 @@ class PairwiseLambdaWeightComputer { return "rank:pairwise"; } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) PairwiseLambdaWeightComputer(const bst_float*, const bst_float*, const dh::SegmentSorter&) {} @@ -137,7 +148,7 @@ class PairwiseLambdaWeightComputer { #endif }; -#if
defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) class BaseLambdaWeightMultiplier { public: BaseLambdaWeightMultiplier(const dh::SegmentSorter &segment_label_sorter, @@ -209,12 +220,12 @@ class IndexablePredictionSorter { // beta version: NDCG lambda rank class NDCGLambdaWeightComputer -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) : public IndexablePredictionSorter #endif { public: -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) // This function object computes the item's DCG value class ComputeItemDCG : public thrust::unary_function { public: @@ -281,6 +292,7 @@ class NDCGLambdaWeightComputer dh::XGBCachingDeviceAllocator alloc; // Compute each elements DCG values and reduce them across groups concurrently. +#if defined(XGBOOST_USE_CUDA) auto end_range = thrust::reduce_by_key(thrust::cuda::par(alloc), dh::tcbegin(group_segments), dh::tcend(group_segments), @@ -293,6 +305,20 @@ class NDCGLambdaWeightComputer group_segments)), thrust::make_discard_iterator(), // We don't care for the group indices dgroup_dcg_.begin()); // Sum of the item's DCG values in the group +#elif defined(XGBOOST_USE_HIP) + auto end_range = + thrust::reduce_by_key(thrust::hip::par(alloc), + dh::tcbegin(group_segments), dh::tcend(group_segments), + thrust::make_transform_iterator( + // The indices need not be sequential within a group, as we care only + // about the sum of items DCG values within a group + dh::tcbegin(segment_label_sorter.GetOriginalPositionsSpan()), + ComputeItemDCG(segment_label_sorter.GetItemsSpan(), + segment_label_sorter.GetGroupsSpan(), + group_segments)), + thrust::make_discard_iterator(), // We don't care for the group indices + dgroup_dcg_.begin()); // Sum of the item's DCG values in the group +#endif CHECK_EQ(static_cast(end_range.second - dgroup_dcg_.begin()), dgroup_dcg_.size()); } @@ -368,7 +394,7 @@ class NDCGLambdaWeightComputer return delta; } -#if 
defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) dh::caching_device_vector dgroup_dcg_; // This computes the adjustment to the weight const NDCGLambdaWeightMultiplier weight_multiplier_; @@ -376,7 +402,7 @@ class NDCGLambdaWeightComputer }; class MAPLambdaWeightComputer -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) : public IndexablePredictionSorter #endif { @@ -417,7 +443,7 @@ class MAPLambdaWeightComputer private: template XGBOOST_DEVICE inline static void Swap(T &v0, T &v1) { -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) thrust::swap(v0, v1); #else std::swap(v0, v1); @@ -504,7 +530,7 @@ class MAPLambdaWeightComputer } } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) MAPLambdaWeightComputer(const bst_float *dpreds, const bst_float *dlabels, const dh::SegmentSorter &segment_label_sorter) @@ -545,10 +571,17 @@ class MAPLambdaWeightComputer // This is required for computing the accumulated precisions const auto &group_segments = segment_label_sorter.GetGroupSegmentsSpan(); // Data segmented into different groups... +#if defined(XGBOOST_USE_CUDA) thrust::inclusive_scan_by_key(thrust::cuda::par(alloc), dh::tcbegin(group_segments), dh::tcend(group_segments), dhits.begin(), // Input value dhits.begin()); // In-place scan +#elif defined(XGBOOST_USE_HIP) + thrust::inclusive_scan_by_key(thrust::hip::par(alloc), + dh::tcbegin(group_segments), dh::tcend(group_segments), + dhits.begin(), // Input value + dhits.begin()); // In-place scan +#endif // Compute accumulated precisions for each item, assuming positive and // negative instances are missing. @@ -574,10 +607,17 @@ class MAPLambdaWeightComputer // Lastly, compute the accumulated precisions for all the items segmented by groups. 
// The precisions are accumulated within each group +#if defined(XGBOOST_USE_CUDA) thrust::inclusive_scan_by_key(thrust::cuda::par(alloc), dh::tcbegin(group_segments), dh::tcend(group_segments), this->dmap_stats_.begin(), // Input map stats this->dmap_stats_.begin()); // In-place scan and output here +#elif defined(XGBOOST_USE_HIP) + thrust::inclusive_scan_by_key(thrust::hip::par(alloc), + dh::tcbegin(group_segments), dh::tcend(group_segments), + this->dmap_stats_.begin(), // Input map stats + this->dmap_stats_.begin()); // In-place scan and output here +#endif } inline const common::Span GetMapStatsSpan() const { @@ -625,7 +665,7 @@ class MAPLambdaWeightComputer #endif }; -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) class SortedLabelList : dh::SegmentSorter { private: const LambdaRankParam ¶m_; // Objective configuration @@ -670,7 +710,13 @@ class SortedLabelList : dh::SegmentSorter { auto wmultiplier = weight_computer.GetWeightMultiplier(); int device_id = -1; + +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaGetDevice(&device_id)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipGetDevice(&device_id)); +#endif + // For each instance in the group, compute the gradient pair concurrently dh::LaunchN(niter, nullptr, [=] __device__(uint32_t idx) { // First, determine the group 'idx' belongs to @@ -723,7 +769,12 @@ class SortedLabelList : dh::SegmentSorter { bst_float h = thrust::max(p * (1.0f - p), eps); // Rescale each gradient and hessian so that the group has a weighted constant +#if defined(XGBOOST_USE_CUDA) float scale = __frcp_ru(niter / total_items); +#elif defined(XGBOOST_USE_HIP) + float scale = __frcp_rn(niter / total_items); +#endif + if (fix_list_weight != 0.0f) { scale *= fix_list_weight / total_group_items; } @@ -741,7 +792,11 @@ class SortedLabelList : dh::SegmentSorter { }); // Wait until the computations done by the kernel is complete +#if defined(XGBOOST_USE_CUDA) 
dh::safe_cuda(cudaStreamSynchronize(nullptr)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipStreamSynchronize(nullptr)); +#endif } }; #endif @@ -768,7 +823,7 @@ class LambdaRankObj : public ObjFunction { << "labels size: " << info.labels.Size() << ", " << "group pointer back: " << (gptr.size() == 0 ? 0 : gptr.back()); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) // Check if we have a GPU assignment; else, revert back to CPU auto device = ctx_->gpu_id; if (device >= 0) { @@ -777,7 +832,7 @@ class LambdaRankObj : public ObjFunction { // Revert back to CPU #endif ComputeGradientsOnCPU(preds, info, iter, out_gpair, gptr); -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) } #endif } @@ -898,7 +953,7 @@ class LambdaRankObj : public ObjFunction { exc.Rethrow(); } -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) void ComputeGradientsOnGPU(const HostDeviceVector& preds, const MetaInfo& info, int iter, @@ -907,7 +962,11 @@ class LambdaRankObj : public ObjFunction { LOG(DEBUG) << "Computing " << LambdaWeightComputerT::Name() << " gradients on GPU."; auto device = ctx_->gpu_id; +#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaSetDevice(device)); +#elif defined(XGBOOST_USE_HIP) + dh::safe_cuda(hipSetDevice(device)); +#endif bst_float weight_normalization_factor = ComputeWeightNormalizationFactor(info, gptr); diff --git a/src/objective/rank_obj.hip b/src/objective/rank_obj.hip index e69de29bb..d03129d70 100644 --- a/src/objective/rank_obj.hip +++ b/src/objective/rank_obj.hip @@ -0,0 +1,4 @@ + +#if defined(XGBOOST_USE_HIP) +#include "rank_obj.cu" +#endif