diff --git a/src/collective/device_communicator.cuh b/src/collective/device_communicator.cuh
index 32d69e1b5..b10b86614 100644
--- a/src/collective/device_communicator.cuh
+++ b/src/collective/device_communicator.cuh
@@ -4,7 +4,11 @@
 #pragma once
 #include
 
+#if defined(XGBOOST_USE_HIP)
+#include "../common/device_helpers.hip.h"
+#elif defined(XGBOOST_USE_CUDA)
 #include "../common/device_helpers.cuh"
+#endif
 
 namespace xgboost {
 namespace collective {
diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index b1c5a4271..1356b8e23 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -10,14 +10,26 @@
 #include <cstddef>  // size_t
 #include <cstdint>  // int32_t
+
+#if defined(XGBOOST_USE_HIP)
+#include
+#elif defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>  // DispatchSegmentedRadixSort,NullType,DoubleBuffer
+#endif
+
 #include <iterator>  // distance
 #include <limits>  // numeric_limits
 #include <type_traits>  // conditional_t,remove_const_t
 
 #include "common.h"  // safe_cuda
 #include "cuda_context.cuh"  // CUDAContext
+
+#if defined(XGBOOST_USE_HIP)
+#include "device_helpers.hip.h"
+#elif defined(XGBOOST_USE_CUDA)
 #include "device_helpers.cuh"  // TemporaryArray,SegmentId,LaunchN,Iota,device_vector
+#endif
+
 #include "xgboost/base.h"  // XGBOOST_DEVICE
 #include "xgboost/context.h"  // Context
 #include "xgboost/logging.h"  // CHECK
@@ -39,6 +51,7 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
   using OffsetT = int;
 
   // Null value type
+#if defined(XGBOOST_USE_CUDA)
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<cub::NullType> d_values;
 
@@ -47,6 +60,20 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st
                  OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values,
                                     num_items, num_segments, d_begin_offsets, d_end_offsets,
                                     begin_bit, end_bit, false, ctx->Stream(), debug_synchronous)));
+#elif defined(XGBOOST_USE_HIP)
+  if (IS_DESCENDING) {
+    rocprim::segmented_radix_sort_pairs_desc(d_temp_storage,
+        temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
+        num_segments, d_begin_offsets, d_end_offsets,
+        begin_bit, end_bit, ctx->Stream(), debug_synchronous);
+  }
+  else {
+    rocprim::segmented_radix_sort_pairs(d_temp_storage,
+        temp_storage_bytes, d_keys_in, d_keys_out, nullptr, nullptr, num_items,
+        num_segments, d_begin_offsets, d_end_offsets,
+        begin_bit, end_bit, ctx->Stream(), debug_synchronous);
+  }
+#endif
 }
 
 // Wrapper around cub sort for easier `descending` sort.
@@ -60,14 +87,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                   BeginOffsetIteratorT d_begin_offsets,
                                   EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
                                   int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
+#if defined(XGBOOST_USE_CUDA)
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
+#endif
+
   // In old version of cub, num_items in dispatch is also int32_t, no way to change.
   using OffsetT = std::conditional_t(), std::size_t, std::int32_t>;
   CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
   // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
+#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
   dh::safe_cuda((cub::DispatchSegmentedRadixSort<
                  descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
@@ -88,6 +119,18 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                     d_begin_offsets, d_end_offsets, begin_bit, end_bit, false,
                                     stream, false)));
 #endif
+#elif defined(XGBOOST_USE_HIP)
+  if (descending) {
+    rocprim::segmented_radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+        d_values_in, d_values_out, num_items, num_segments,
+        d_begin_offsets, d_end_offsets, begin_bit, end_bit, stream, false);
+  }
+  else {
+    rocprim::segmented_radix_sort_pairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+        d_values_in, d_values_out, num_items, num_segments, d_begin_offsets, d_end_offsets,
+        begin_bit, end_bit, stream, false);
+  }
+#endif
 }
 }  // namespace detail
diff --git a/src/common/device_helpers.hip.h b/src/common/device_helpers.hip.h
index 618efdd39..36c783b49 100644
--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -1208,8 +1208,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
 #endif
 #endif
 
-  safe_cuda((rocprim::inclusive_scan(nullptr,
-      bytes, d_in, d_out, num_items, scan_op)));
+  safe_cuda((rocprim::inclusive_scan(nullptr, bytes, d_in, d_out, (size_t) num_items, scan_op)));
 
   TemporaryArray<char> storage(bytes);
@@ -1229,8 +1228,7 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
 #endif
 #endif
 
-  safe_cuda((rocprim::inclusive_scan(
-      storage.data().get(), bytes, d_in, d_out, num_items, scan_op)));
+  safe_cuda((rocprim::inclusive_scan(storage.data().get(), bytes, d_in, d_out, (size_t) num_items, scan_op)));
 }
 
 template
@@ -1262,11 +1260,7 @@ void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_i
   using ValueT = std::remove_const_t<IdxT>;
 
   TemporaryArray<KeyT> out(keys.size());
-  hipcub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
-                                    out.data().get());
   TemporaryArray<ValueT> sorted_idx_out(sorted_idx.size());
-  hipcub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
-                                        sorted_idx_out.data().get());
 
   // track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
   using OffsetT = std::conditional_t;
@@ -1286,8 +1280,8 @@ void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_i
 #endif
 #endif
 
-    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
-        bytes, d_keys, d_values, sorted_idx.size(), 0,
+    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
+        bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
         sizeof(KeyT) * 8)));
 
     TemporaryArray<char> storage(bytes);
@@ -1305,8 +1299,8 @@ void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_i
 #endif
 #endif
-    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
-        bytes, d_keys, d_values, sorted_idx.size(), 0,
+    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
+        bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
         sizeof(KeyT) * 8)));
   } else {
     void *d_temp_storage = nullptr;
@@ -1323,8 +1317,8 @@ void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_i
 #endif
 #endif
-    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
-        bytes, d_keys, d_values, sorted_idx.size(), 0,
+    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
+        bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
         sizeof(KeyT) * 8)));
 
     TemporaryArray<char> storage(bytes);
@@ -1341,8 +1335,8 @@ void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_i
                                            sizeof(KeyT) * 8, false, nullptr, false)));
 #endif
 #endif
-    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
-        bytes, d_keys, d_values, sorted_idx.size(), 0,
+    safe_cuda((rocprim::radix_sort_pairs(d_temp_storage,
+        bytes, keys.data(), out.data().get(), sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), 0,
         sizeof(KeyT) * 8)));
   }
diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh
index 5ff78144d..1ca922993 100644
--- a/src/common/threading_utils.cuh
+++ b/src/common/threading_utils.cuh
@@ -9,7 +9,13 @@
 #include "./math.h"  // Sqr
 #include "common.h"
+
+#if defined(XGBOOST_USE_HIP)
+#include "device_helpers.hip.h"
+#elif defined(XGBOOST_USE_CUDA)
 #include "device_helpers.cuh"  // LaunchN
+#endif
+
 #include "xgboost/base.h"  // XGBOOST_DEVICE
 #include "xgboost/span.h"  // Span
@@ -67,7 +73,7 @@ SegmentedTrapezoidThreads(xgboost::common::Span group_ptr,
   dh::safe_cuda(hipMemcpy(
       &total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
       sizeof(total), hipMemcpyDeviceToHost));
-#else
+#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaMemcpy(
       &total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
       sizeof(total), cudaMemcpyDeviceToHost));
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index a926c2c5b..d8a32d201 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -393,7 +393,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
     .describe("Receiver Operating Characteristic Area Under the Curve.")
     .set_body([](const char*) { return new EvalROCAUC(); });
 
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
                                                    std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
@@ -414,7 +414,7 @@ std::pair GPURankingAUC(Context const *, common::Span
   common::AssertGPUSupport();
   return {};
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 
 class EvalPRAUC : public EvalAUC<EvalPRAUC> {
   std::shared_ptr<DeviceAUCCache> d_cache_;
@@ -471,7 +471,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
     .describe("Area under PR curve for both classification and rank.")
     .set_body([](char const *) { return new EvalPRAUC{}; });
 
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
 std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
                                                   std::int32_t, std::shared_ptr<DeviceAUCCache> *) {
   common::AssertGPUSupport();
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index fdbf0501a..62db02a00 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -5,7 +5,13 @@
 #include
 #include
+
+#if defined(XGBOOST_USE_HIP)
+#include  // NOLINT
+#elif defined(XGBOOST_USE_CUDA)
 #include  // NOLINT
+#endif
+
 #include
 #include
 #include
@@ -89,7 +95,12 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
              Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
   auto labels = info.labels.View(device);
   auto weights = info.weights_.ConstDeviceSpan();
+
+#if defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(device));
+#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
+#endif
 
   CHECK_NE(labels.Size(), 0);
   CHECK_EQ(labels.Size(), predts.size());
@@ -120,10 +131,19 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
   auto uni_key = dh::MakeTransformIterator(
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
+
+#if defined(XGBOOST_USE_HIP)
+  auto end_unique = thrust::unique_by_key_copy(
+      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
+      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
+      dh::tbegin(d_unique_idx));
+#elif defined(XGBOOST_USE_CUDA)
   auto end_unique = thrust::unique_by_key_copy(
       thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
       dh::tbegin(d_unique_idx));
+#endif
+
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
 
   dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp),
@@ -163,7 +183,13 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
       });
 
   Pair last = cache->fptp.back();
+
+#if defined(XGBOOST_USE_HIP)
+  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
+#elif defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
+#endif
+
   return std::make_tuple(last.first, last.second, auc);
 }
@@ -218,9 +244,17 @@ double ScaleClasses(common::Span results, common::Span local_are
   double tp_sum;
   double auc_sum;
+
+#if defined(XGBOOST_USE_HIP)
+  thrust::tie(auc_sum, tp_sum) =
+      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
+                     Pair{0.0, 0.0}, PairPlus{});
+#elif defined(XGBOOST_USE_CUDA)
   thrust::tie(auc_sum, tp_sum) =
       thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
                      Pair{0.0, 0.0}, PairPlus{});
+#endif
+
   if (tp_sum != 0 && !std::isnan(auc_sum)) {
     auc_sum /= tp_sum;
   } else {
@@ -300,9 +334,16 @@ void SegmentedReduceAUC(common::Span d_unique_idx,
         double auc = area_fn(fp_prev, fp, tp_prev, tp, class_id);
         return auc;
       });
+
+#if defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
+                        key_in + d_unique_idx.size(), val_in,
+                        thrust::make_discard_iterator(), dh::tbegin(d_auc));
+#elif defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
+#endif
 }
 /**
  *
@@ -312,7 +353,12 @@ void SegmentedReduceAUC(common::Span d_unique_idx,
 template
 double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span d_class_ptr,
                            size_t n_classes, std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
+#if defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(device));
+#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(device));
+#endif
+
   /**
    * Sorted idx
    */
@@ -373,6 +419,19 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
+
+#if defined(XGBOOST_USE_HIP)
+  auto n_uniques = dh::SegmentedUniqueByKey(
+      thrust::hip::par(alloc),
+      dh::tbegin(d_class_ptr),
+      dh::tend(d_class_ptr),
+      uni_key,
+      uni_key + d_sorted_idx.size(),
+      dh::tbegin(d_unique_idx),
+      d_unique_class_ptr.data(),
+      dh::tbegin(d_unique_idx),
+      thrust::equal_to<thrust::pair<uint32_t, float>>{});
+#elif defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
       thrust::cuda::par(alloc),
       dh::tbegin(d_class_ptr),
       dh::tend(d_class_ptr),
       uni_key,
       uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx),
@@ -383,6 +442,8 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
+#endif
+
   d_unique_idx = d_unique_idx.subspan(0, n_uniques);
 
   auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; };
@@ -500,9 +561,17 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
   auto check_it = dh::MakeTransformIterator(
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
+
+#if defined(XGBOOST_USE_HIP)
+  size_t n_valid = thrust::count_if(
+      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
+      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
+#elif defined(XGBOOST_USE_CUDA)
   size_t n_valid = thrust::count_if(
       thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
       [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
+#endif
+
   if (n_valid < info.group_ptr_.size() - 1) {
     InvalidGroupAUC();
   }
@@ -599,8 +668,14 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
   /**
    * Scale the AUC with number of items in each group.
    */
+#if defined(XGBOOST_USE_HIP)
+  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
+                              dh::tend(s_d_auc), 0.0);
+#elif defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
                               dh::tend(s_d_auc), 0.0);
+#endif
+
   return std::make_pair(auc, n_valid);
 }
@@ -627,9 +702,16 @@ std::tuple GPUBinaryPRAUC(common::Span pred
       });
   dh::XGBCachingDeviceAllocator<char> alloc;
   double total_pos, total_neg;
+
+#if defined(XGBOOST_USE_HIP)
+  thrust::tie(total_pos, total_neg) =
+      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
+                     Pair{0.0, 0.0}, PairPlus{});
+#elif defined(XGBOOST_USE_CUDA)
   thrust::tie(total_pos, total_neg) =
      thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
                     Pair{0.0, 0.0}, PairPlus{});
+#endif
 
   if (total_pos <= 0.0 || total_neg <= 0.0) {
     return {0.0f, 0.0f, 0.0f};
@@ -681,10 +763,18 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts,
         return thrust::make_pair(y * w, (1.0f - y) * w);
       });
   dh::XGBCachingDeviceAllocator<char> alloc;
+
+#if defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
+                        key_it + predts.size(), val_it,
+                        thrust::make_discard_iterator(), totals.begin(),
+                        thrust::equal_to{}, PairPlus{});
+#elif defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to{}, PairPlus{});
+#endif
 
   /**
    * Calculate AUC
@@ -752,6 +842,19 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info,
   // unique values are sparse, so we need a CSR style indptr
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
+
+#if defined(XGBOOST_USE_HIP)
+  auto n_uniques = dh::SegmentedUniqueByKey(
+      thrust::hip::par(alloc),
+      dh::tbegin(d_group_ptr),
+      dh::tend(d_group_ptr),
+      uni_key,
+      uni_key + d_sorted_idx.size(),
+      dh::tbegin(d_unique_idx),
+      d_unique_class_ptr.data(),
+      dh::tbegin(d_unique_idx),
+      thrust::equal_to<thrust::pair<uint32_t, float>>{});
+#elif defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
       thrust::cuda::par(alloc),
       dh::tbegin(d_group_ptr),
       dh::tend(d_group_ptr),
       uni_key,
       uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx),
@@ -762,6 +865,8 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
+#endif
+
   d_unique_idx = d_unique_idx.subspan(0, n_uniques);
 
   auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) {
@@ -812,9 +917,16 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info,
       }
       return thrust::make_pair(0.0, static_cast<uint32_t>(1));
     });
+
+#if defined(XGBOOST_USE_HIP)
+    thrust::tie(auc, invalid_groups) = thrust::reduce(
+        thrust::hip::par(alloc), it, it + n_groups,
+        thrust::pair(0.0, 0), PairPlus{});
+#elif defined(XGBOOST_USE_CUDA)
     thrust::tie(auc, invalid_groups) = thrust::reduce(
         thrust::cuda::par(alloc), it, it + n_groups,
         thrust::pair(0.0, 0), PairPlus{});
+#endif
   }
 
   return std::make_pair(auc, n_groups - invalid_groups);
 }
@@ -823,7 +935,12 @@ std::pair GPURankingPRAUC(Context const *ctx,
                           common::Span<float const> predts, MetaInfo const &info,
                           std::shared_ptr<DeviceAUCCache> *p_cache) {
+#if defined(XGBOOST_USE_HIP)
+  dh::safe_cuda(hipSetDevice(ctx->gpu_id));
+#elif defined(XGBOOST_USE_CUDA)
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+#endif
+
   if (predts.empty()) {
     return std::make_pair(0.0, static_cast<uint32_t>(0));
   }
@@ -845,10 +962,19 @@ std::pair GPURankingPRAUC(Context const *ctx,
   dh::XGBDeviceAllocator<char> alloc;
   auto labels = info.labels.View(ctx->gpu_id);
+
+#if defined(XGBOOST_USE_HIP)
+  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
+                     dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
+    InvalidLabels();
+  }
+#elif defined(XGBOOST_USE_CUDA)
   if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
+#endif
+
   /**
    * Get total positive/negative for each group.
    */
@@ -868,10 +994,18 @@ std::pair GPURankingPRAUC(Context const *ctx,
         auto y = labels(i);
         return thrust::make_pair(y * w, (1.0 - y) * w);
       });
+
+#if defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
+                        key_it + predts.size(), val_it,
+                        thrust::make_discard_iterator(), totals.begin(),
+                        thrust::equal_to{}, PairPlus{});
+#elif defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to{}, PairPlus{});
+#endif
 
   /**
    * Calculate AUC
diff --git a/src/metric/auc.hip b/src/metric/auc.hip
index e69de29bb..a96cbbde5 100644
--- a/src/metric/auc.hip
+++ b/src/metric/auc.hip
@@ -0,0 +1,4 @@
+
+#if defined(XGBOOST_USE_HIP)
+#include "auc.cu"
+#endif
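
The hunks above all apply one recurring pattern: choose the HIP or CUDA backend at compile time, include the matching device-helper header, and route each Thrust call through the corresponding execution policy (thrust::hip::par versus thrust::cuda::par). The stand-alone sketch below is illustrative only and is not part of the patch; the helper name BackendReduceSum is invented for the example, and the rocThrust execution-policy header path is an assumption that should be checked against the installed ROCm toolkit.

// Illustrative sketch only -- not part of the patch above. It shows the
// XGBOOST_USE_HIP / XGBOOST_USE_CUDA guard pattern the patch applies:
// guard the backend-specific header, then dispatch the Thrust call through
// the matching execution policy. BackendReduceSum is a hypothetical helper.
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

#if defined(XGBOOST_USE_HIP)
#include <thrust/system/hip/execution_policy.h>   // rocThrust: provides thrust::hip::par (assumed path)
#elif defined(XGBOOST_USE_CUDA)
#include <thrust/system/cuda/execution_policy.h>  // Thrust: provides thrust::cuda::par
#endif

// Sum a device vector with the backend-specific execution policy.
inline double BackendReduceSum(thrust::device_vector<double> const &values) {
#if defined(XGBOOST_USE_HIP)
  return thrust::reduce(thrust::hip::par, values.begin(), values.end(), 0.0);
#elif defined(XGBOOST_USE_CUDA)
  return thrust::reduce(thrust::cuda::par, values.begin(), values.end(), 0.0);
#else
  // Neither GPU backend enabled: fall back to Thrust's default dispatch.
  return thrust::reduce(values.begin(), values.end(), 0.0);
#endif
}

The patch itself follows the same shape at every call site, but passes a caching allocator to the policy, i.e. thrust::hip::par(alloc) or thrust::cuda::par(alloc), so temporary storage is reused across calls.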