diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh
index b1e903821..6168388f0 100644
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -36,21 +36,21 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
  private:
  static constexpr std::size_t kUuidLength =
-#if defined(XGBOOST_USE_HIP)
-     sizeof(hipUUID) / sizeof(uint64_t);
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
      sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
+#elif defined(XGBOOST_USE_HIP)
+     sizeof(hipUUID) / sizeof(uint64_t);
 #endif
 
  void GetCudaUUID(xgboost::common::Span<uint64_t, kUuidLength> const &uuid) const {
-#if defined(XGBOOST_USE_HIP)
-    hipUUID id;
-    hipDeviceGetUuid(&id, device_ordinal_);
-    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
     cudaDeviceProp prob{};
     dh::safe_cuda(cudaGetDeviceProperties(&prob, device_ordinal_));
     std::memcpy(uuid.data(), static_cast<void *>(&(prob.uuid)), sizeof(prob.uuid));
+#elif defined(XGBOOST_USE_HIP)
+    hipUUID id;
+    hipDeviceGetUuid(&id, device_ordinal_);
+    std::memcpy(uuid.data(), static_cast<void *>(&id), sizeof(id));
 #endif
  }
diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index 8bf6bb808..2d80c06d8 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -11,10 +11,10 @@
 #include <cstddef>  // size_t
 #include <cstdint>  // int32_t
 
-#if defined(XGBOOST_USE_HIP)
-#include <hipcub/hipcub.hpp>
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>  // DispatchSegmentedRadixSort,NullType,DoubleBuffer
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>
 #endif
 
 #include <iterator>  // distance
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index feddba99e..c4112ee13 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -175,17 +175,17 @@ void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cu
     return thrust::min(num_cuts_per_feature, column_size);
   });
 
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
+  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
+                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
+  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
+                         column_sizes_scan->end(), column_sizes_scan->begin());
+#elif defined(XGBOOST_USE_HIP)
   thrust::exclusive_scan(thrust::hip::par(alloc), cut_ptr_it,
                          cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
   thrust::exclusive_scan(thrust::hip::par(alloc), column_sizes_scan->begin(),
                          column_sizes_scan->end(), column_sizes_scan->begin());
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
-                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
-  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
-                         column_sizes_scan->end(), column_sizes_scan->begin());
 #endif
 }
 
@@ -309,12 +309,12 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                            &sorted_entries);
   dh::XGBDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_HIP)
-  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
-               sorted_entries.end(), detail::EntryCompareOp());
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());
+#elif defined(XGBOOST_USE_HIP)
+  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
+               sorted_entries.end(), detail::EntryCompareOp());
 #endif
 
   if (sketch_container->HasCategorical()) {
@@ -374,14 +374,14 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
       return weights[group_idx];
     });
 
-#if defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
+#if defined(XGBOOST_USE_CUDA)
+    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
                                  is_valid);
-#elif defined(XGBOOST_USE_CUDA)
-    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
+#elif defined(XGBOOST_USE_HIP)
+    auto retit = thrust::copy_if(thrust::hip::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
@@ -397,14 +397,14 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
       return weights[batch.GetElement(idx).row_idx];
     });
 
-#if defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
+#if defined(XGBOOST_USE_CUDA)
+    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
                                  is_valid);
-#elif defined(XGBOOST_USE_CUDA)
-    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
+#elif defined(XGBOOST_USE_HIP)
+    auto retit = thrust::copy_if(thrust::hip::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 1eaa15c70..fac254abf 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -184,15 +184,15 @@ class SketchContainer {
       d_column_scan = this->columns_ptr_.DeviceSpan();
 
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
       size_t n_uniques = dh::SegmentedUnique(
-          thrust::hip::par(alloc), d_column_scan.data(),
+          thrust::cuda::par(alloc), d_column_scan.data(),
           d_column_scan.data() + d_column_scan.size(), entries.data(),
           entries.data() + entries.size(), scan_out.DevicePointer(),
           entries.data(), detail::SketchUnique{}, key_comp);
-#elif defined(XGBOOST_USE_CUDA)
+#elif defined(XGBOOST_USE_HIP)
       size_t n_uniques = dh::SegmentedUnique(
-          thrust::cuda::par(alloc), d_column_scan.data(),
+          thrust::hip::par(alloc), d_column_scan.data(),
           d_column_scan.data() + d_column_scan.size(), entries.data(),
           entries.data() + entries.size(), scan_out.DevicePointer(),
           entries.data(), detail::SketchUnique{}, key_comp);
diff --git a/src/common/stats.cuh b/src/common/stats.cuh
index d61adc41a..0de654818 100644
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -217,12 +217,12 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
   auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                    detail::WeightOp{w_begin, d_sorted_idx});
 
-#if defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan_by_key(thrust::hip::par(caching), scan_key, scan_key + n_weights,
-                                scan_val, weights_cdf.begin());
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                 scan_val, weights_cdf.begin());
+#elif defined(XGBOOST_USE_HIP)
+  thrust::inclusive_scan_by_key(thrust::hip::par(caching), scan_key, scan_key + n_weights,
+                                scan_val, weights_cdf.begin());
 #endif
 
   auto n_segments = std::distance(seg_beg, seg_end) - 1;
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 0c24a4829..abbc4e944 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -6,10 +6,10 @@
 #include
 #include
 
-#if defined(XGBOOST_USE_HIP)
-#include <hipcub/hipcub.hpp>  // NOLINT
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
 #include <cub/cub.cuh>  // NOLINT
+#elif defined(XGBOOST_USE_HIP)
+#include <hipcub/hipcub.hpp>  // NOLINT
 #endif
 
 #include
@@ -127,16 +127,16 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
 
-#if defined(XGBOOST_USE_HIP)
-  auto end_unique = thrust::unique_by_key_copy(
-      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
-      dh::tbegin(d_unique_idx));
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   auto end_unique = thrust::unique_by_key_copy(
       thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
      dh::tbegin(d_unique_idx));
+#elif defined(XGBOOST_USE_HIP)
+  auto end_unique = thrust::unique_by_key_copy(
+      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
+      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
+      dh::tbegin(d_unique_idx));
 #endif
 
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
@@ -179,10 +179,10 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
 
   Pair last = cache->fptp.back();
 
-#if defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
+#elif defined(XGBOOST_USE_HIP)
+  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
 #endif
 
   return std::make_tuple(last.first, last.second, auc);
@@ -239,14 +239,14 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
   double tp_sum;
   double auc_sum;
-#if defined(XGBOOST_USE_HIP)
-  thrust::tie(auc_sum, tp_sum) =
-      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
-                     Pair{0.0, 0.0}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::tie(auc_sum, tp_sum) =
       thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
                      Pair{0.0, 0.0}, PairPlus<double, double>{});
+#elif defined(XGBOOST_USE_HIP)
+  thrust::tie(auc_sum, tp_sum) =
+      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
+                     Pair{0.0, 0.0}, PairPlus<double, double>{});
 #endif
 
   if (tp_sum != 0 && !std::isnan(auc_sum)) {
@@ -329,12 +329,12 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
         return auc;
       });
 
-#if defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
+#if defined(XGBOOST_USE_CUDA)
+  thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
+#elif defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
 #endif
@@ -410,9 +410,9 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
+      thrust::cuda::par(alloc),
       dh::tbegin(d_class_ptr),
       dh::tend(d_class_ptr),
       uni_key,
@@ -421,9 +421,9 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#elif defined(XGBOOST_USE_CUDA)
+#elif defined(XGBOOST_USE_HIP)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::cuda::par(alloc),
+      thrust::hip::par(alloc),
       dh::tbegin(d_class_ptr),
       dh::tend(d_class_ptr),
       uni_key,
@@ -553,14 +553,14 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
 
-#if defined(XGBOOST_USE_HIP)
-  size_t n_valid = thrust::count_if(
-      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
-      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   size_t n_valid = thrust::count_if(
       thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
       [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
+#elif defined(XGBOOST_USE_HIP)
+  size_t n_valid = thrust::count_if(
+      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
+      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
 #endif
 
   if (n_valid < info.group_ptr_.size() - 1) {
@@ -659,12 +659,12 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
   /**
    * Scale the AUC with number of items in each group.
    */
-#if defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
-                              dh::tend(s_d_auc), 0.0);
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
                               dh::tend(s_d_auc), 0.0);
+#elif defined(XGBOOST_USE_HIP)
+  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
+                              dh::tend(s_d_auc), 0.0);
 #endif
 
   return std::make_pair(auc, n_valid);
@@ -694,14 +694,14 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
   dh::XGBCachingDeviceAllocator<char> alloc;
   double total_pos, total_neg;
-#if defined(XGBOOST_USE_HIP)
-  thrust::tie(total_pos, total_neg) =
-      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
-                     Pair{0.0, 0.0}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
   thrust::tie(total_pos, total_neg) =
       thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
                      Pair{0.0, 0.0}, PairPlus<double, double>{});
+#elif defined(XGBOOST_USE_HIP)
+  thrust::tie(total_pos, total_neg) =
+      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
+                     Pair{0.0, 0.0}, PairPlus<double, double>{});
 #endif
 
   if (total_pos <= 0.0 || total_neg <= 0.0) {
@@ -755,13 +755,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
       });
 
   dh::XGBCachingDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
+#if defined(XGBOOST_USE_CUDA)
+  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
+#elif defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
@@ -834,9 +834,9 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
+      thrust::cuda::par(alloc),
       dh::tbegin(d_group_ptr),
       dh::tend(d_group_ptr),
       uni_key,
@@ -845,9 +845,9 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#elif defined(XGBOOST_USE_CUDA)
+#elif defined(XGBOOST_USE_HIP)
   auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::cuda::par(alloc),
+      thrust::hip::par(alloc),
       dh::tbegin(d_group_ptr),
       dh::tend(d_group_ptr),
       uni_key,
@@ -909,14 +909,14 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
         return thrust::make_pair(0.0, static_cast<uint32_t>(1));
       });
 
-#if defined(XGBOOST_USE_HIP)
-    thrust::tie(auc, invalid_groups) = thrust::reduce(
-        thrust::hip::par(alloc), it, it + n_groups,
-        thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
-#elif defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA)
     thrust::tie(auc, invalid_groups) = thrust::reduce(
         thrust::cuda::par(alloc), it, it + n_groups,
         thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
+#elif defined(XGBOOST_USE_HIP)
+    thrust::tie(auc, invalid_groups) = thrust::reduce(
+        thrust::hip::par(alloc), it, it + n_groups,
+        thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
 #endif
   }
   return std::make_pair(auc, n_groups - invalid_groups);
@@ -949,13 +949,13 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
   dh::XGBDeviceAllocator<char> alloc;
   auto labels = info.labels.View(ctx->Device());
-#if defined(XGBOOST_USE_HIP)
-  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
+#if defined(XGBOOST_USE_CUDA)
+  if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
-#elif defined(XGBOOST_USE_CUDA)
-  if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
+#elif defined(XGBOOST_USE_HIP)
+  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
@@ -981,13 +981,13 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
         return thrust::make_pair(y * w, (1.0 - y) * w);
       });
 
-#if defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
+#if defined(XGBOOST_USE_CUDA)
+  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_CUDA)
-  thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
+#elif defined(XGBOOST_USE_HIP)
+  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index 937e31400..f52b28fd1 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -62,6 +62,21 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
         return PackedReduceResult{v, wt};
       },
       PackedReduceResult{}, thrust::plus<PackedReduceResult>());
+#elif defined(XGBOOST_USE_HIP)
+  dh::XGBCachingDeviceAllocator<char> alloc;
+  thrust::counting_iterator<size_t> begin(0);
+  thrust::counting_iterator<size_t> end = begin + labels.Size();
+  result = thrust::transform_reduce(
+      thrust::hip::par(alloc), begin, end,
+      [=] XGBOOST_DEVICE(size_t i) {
+        auto idx = linalg::UnravelIndex(i, labels.Shape());
+        auto sample_id = std::get<0>(idx);
+        auto target_id = std::get<1>(idx);
+        auto res = loss(i, sample_id, target_id);
+        float v{std::get<0>(res)}, wt{std::get<1>(res)};
+        return PackedReduceResult{v, wt};
+      },
+      PackedReduceResult{}, thrust::plus<PackedReduceResult>());
 #else
   common::AssertGPUSupport();
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 542a7b6a5..70cbca529 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -11,7 +11,9 @@
 #include "evaluate_splits.cuh"
 #include "expand_entry.cuh"
 
-#if defined(XGBOOST_USE_HIP)
+#if defined(XGBOOST_USE_CUDA)
+#define WARP_SIZE 32
+#elif defined(XGBOOST_USE_HIP)
 #include <hip/hip_runtime.h>
 
 #ifdef __AMDGCN_WAVEFRONT_SIZE
@@ -20,8 +22,6 @@
 #endif
 
 #define WARP_SIZE WAVEFRONT_SIZE
-#elif defined(XGBOOST_USE_CUDA)
-#define WARP_SIZE 32
 #endif
 
 #if defined(XGBOOST_USE_HIP)
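
Note (reviewer commentary, not part of the patch): every hunk above makes the same mechanical change — each "#if defined(XGBOOST_USE_HIP) ... #elif defined(XGBOOST_USE_CUDA)" guard is reordered so the CUDA branch is tested first, while neither branch's body is altered. Since the duplicated branches differ only in the Thrust execution policy, the repetition could in principle be factored out with a small helper along the lines below. This is a hedged sketch only; "BackendPolicy" is a hypothetical name that does not exist in XGBoost.

    #include <thrust/execution_policy.h>

    // Return the Thrust execution policy for the enabled GPU backend,
    // checking CUDA first to match the ordering this patch establishes.
    // Only one branch is compiled, so the deduced return type is unambiguous.
    template <typename Alloc>
    auto BackendPolicy(Alloc &alloc) {
    #if defined(XGBOOST_USE_CUDA)
      return thrust::cuda::par(alloc);  // NVIDIA build (Thrust)
    #elif defined(XGBOOST_USE_HIP)
      return thrust::hip::par(alloc);   // ROCm build (rocThrust)
    #endif
    }

    // Call sites would then collapse to a single unguarded statement, e.g.
    //   double auc = thrust::reduce(BackendPolicy(alloc), in, in + n, 0.0);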