diff --git a/src/common/cuda_context.cuh b/src/common/cuda_context.cuh
index dce5a9858..17896460f 100644
--- a/src/common/cuda_context.cuh
+++ b/src/common/cuda_context.cuh
@@ -6,6 +6,12 @@
 #include
 #include "device_helpers.cuh"

+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 struct CUDAContext {
  private:
@@ -17,37 +23,21 @@ struct CUDAContext {
    * \brief Caching thrust policy.
    */
   auto CTP() const {
-#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
     return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
     return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
 #endif  // THRUST_MAJOR_VERSION >= 2
-#elif defined(XGBOOST_USE_HIP)
-#if THRUST_MAJOR_VERSION >= 2
-    return thrust::hip::par_nosync(caching_alloc_).on(dh::DefaultStream());
-#else
-    return thrust::hip::par(caching_alloc_).on(dh::DefaultStream());
-#endif  // THRUST_MAJOR_VERSION >= 2
-#endif
   }
   /**
    * \brief Thrust policy without caching allocator.
   */
   auto TP() const {
-#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
     return thrust::cuda::par_nosync(alloc_).on(dh::DefaultStream());
 #else
     return thrust::cuda::par(alloc_).on(dh::DefaultStream());
 #endif  // THRUST_MAJOR_VERSION >= 2
-#elif defined(XGBOOST_USE_HIP)
-#if THRUST_MAJOR_VERSION >= 2
-    return thrust::hip::par_nosync(alloc_).on(dh::DefaultStream());
-#else
-    return thrust::hip::par(alloc_).on(dh::DefaultStream());
-#endif  // THRUST_MAJOR_VERSION >= 2
-#endif
   }
   auto Stream() const { return dh::DefaultStream(); }
 };
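Every hunk below repeats the pattern introduced here: under a HIP build, thrust::cuda is declared as a namespace alias for rocThrust's thrust::hip, so the CUDA spelling of each execution policy compiles on both backends and the duplicated #elif defined(XGBOOST_USE_HIP) branches can simply be deleted. A minimal sketch of the trick in isolation (a hypothetical translation unit, not part of the patch; it assumes rocThrust declares thrust::hip, which the patch itself relies on):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

#ifdef XGBOOST_USE_HIP
namespace thrust {
// The alias must be visible before the first use of thrust::cuda::.
namespace cuda = thrust::hip;
}  // namespace thrust
#endif

int SumOnDevice(thrust::device_vector<int> const &v) {
  // One spelling for both backends: the real thrust::cuda under nvcc,
  // thrust::hip through the alias under a HIP build.
  return thrust::reduce(thrust::cuda::par, v.begin(), v.end(), 0);
}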
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 7bdd90eb9..bd0c894f0 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -26,6 +26,12 @@
 #include "quantile.h"
 #include "xgboost/host_device_vector.h"

+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::common {
 constexpr float SketchContainer::kFactor;

@@ -112,7 +118,6 @@ void SortByWeight(dh::device_vector* weights, dh::device_vector* s
   // Sort both entries and weights.
   dh::XGBDeviceAllocator alloc;
   CHECK_EQ(weights->size(), sorted_entries->size());
-#if defined(XGBOOST_USE_CUDA)
   thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(),
                       weights->begin(), detail::EntryCompareOp());

@@ -122,17 +127,6 @@ void SortByWeight(dh::device_vector* weights, dh::device_vector* s
       thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
       weights->begin(),
       [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::sort_by_key(thrust::hip::par(alloc), sorted_entries->begin(), sorted_entries->end(),
-                      weights->begin(), detail::EntryCompareOp());
-
-  // Scan weights
-  dh::XGBCachingDeviceAllocator caching;
-  thrust::inclusive_scan_by_key(
-      thrust::hip::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
-      weights->begin(),
-      [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
-#endif
 }

 void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr,
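With the branches collapsed, SortByWeight is a single code path: sort the entries with their weights attached, then run a segmented inclusive scan keyed on equal feature indices. A toy version of that two-step idiom (hypothetical helper; the default policy stands in for thrust::cuda::par(alloc) for brevity):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/sort.h>

// keys: one feature index per entry; vals: that entry's weight.
void SortThenSegmentedScan(thrust::device_vector<int> *keys,
                           thrust::device_vector<float> *vals) {
  // Reorder vals so each weight still sits next to its (now sorted) key.
  thrust::sort_by_key(keys->begin(), keys->end(), vals->begin());
  // Accumulate weights within each run of equal keys, in place.
  thrust::inclusive_scan_by_key(keys->begin(), keys->end(), vals->begin(),
                                vals->begin(), thrust::equal_to<int>{});
}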
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index c4112ee13..aec733ddc 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -19,6 +19,10 @@
 #if defined(XGBOOST_USE_HIP)
 namespace cub = hipcub;
+
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
 #endif

 namespace xgboost::common {
@@ -175,18 +179,10 @@ void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cu
     return thrust::min(num_cuts_per_feature, column_size);
   });

-#if defined(XGBOOST_USE_CUDA)
   thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
                          cut_ptr_it + column_sizes_scan->size(),
                          cuts_ptr->DevicePointer());
   thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
                          column_sizes_scan->end(), column_sizes_scan->begin());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::exclusive_scan(thrust::hip::par(alloc), cut_ptr_it,
-                         cut_ptr_it + column_sizes_scan->size(),
-                         cuts_ptr->DevicePointer());
-  thrust::exclusive_scan(thrust::hip::par(alloc), column_sizes_scan->begin(),
-                         column_sizes_scan->end(), column_sizes_scan->begin());
-#endif
 }

 inline size_t constexpr BytesPerElement(bool has_weight) {
@@ -309,13 +305,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                           &sorted_entries);

   dh::XGBDeviceAllocator alloc;
-#if defined(XGBOOST_USE_CUDA)
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
-               sorted_entries.end(), detail::EntryCompareOp());
-#endif

   if (sketch_container->HasCategorical()) {
     auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -374,19 +365,11 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
        return weights[group_idx];
      });

-#if defined(XGBOOST_USE_CUDA)
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
-#elif defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
-                                 weight_iter + begin, weight_iter + end,
-                                 batch_iter + begin,
-                                 d_temp_weights.data(),  // output
-                                 is_valid);
-#endif
    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  } else {
@@ -397,19 +380,11 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
        return weights[batch.GetElement(idx).row_idx];
      });

-#if defined(XGBOOST_USE_CUDA)
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
-#elif defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
-                                 weight_iter + begin, weight_iter + end,
-                                 batch_iter + begin,
-                                 d_temp_weights.data(),  // output
-                                 is_valid);
-#endif
    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  }
diff --git a/src/common/numeric.cu b/src/common/numeric.cu
index 8d115506a..c25ee2c6a 100644
--- a/src/common/numeric.cu
+++ b/src/common/numeric.cu
@@ -8,18 +8,19 @@
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector

+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::common::cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector const& values) {
   values.SetDevice(ctx->Device());
   auto const d_values = values.ConstDeviceSpan();
   dh::XGBCachingDeviceAllocator alloc;
-#if defined(XGBOOST_USE_CUDA)
   return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                     thrust::plus{});
-#elif defined(XGBOOST_USE_HIP)
-  return dh::Reduce(thrust::hip::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
-                    thrust::plus{});
-#endif
 }
 }  // namespace xgboost::common::cuda_impl
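GetColumnSizesScan above is the usual counts-to-offsets step: an exclusive scan turns per-column element counts into begin offsets, CSR-style. A toy version of that call shape (hypothetical helper; default policy for brevity):

#include <cstddef>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

// {3, 1, 4, 0} -> {0, 3, 4, 8}: slot i becomes the begin offset of segment i.
// The real code scans one extra trailing slot so the total count survives.
void CountsToOffsets(thrust::device_vector<std::size_t> *counts) {
  thrust::exclusive_scan(counts->begin(), counts->end(), counts->begin());
}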
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 6040e266f..849b19480 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -22,6 +22,12 @@
 #include "transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/span.h"

+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace common {

@@ -147,7 +153,6 @@ common::Span> MergePath(
   // We reuse the memory for storing merge path.
   common::Span merge_path{reinterpret_cast(out.data()), out.size()};
   // Determine the merge path, 0 if element is from x, 1 if it's from y.
-#if defined(XGBOOST_USE_CUDA)
   thrust::merge_by_key(
       thrust::cuda::par(alloc), x_merge_key_it, x_merge_key_it + d_x.size(),
       y_merge_key_it, y_merge_key_it + d_y.size(), x_merge_val_it,
@@ -160,36 +165,15 @@
         }
         return l_column_id < r_column_id;
       });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::merge_by_key(
-      thrust::hip::par(alloc), x_merge_key_it, x_merge_key_it + d_x.size(),
-      y_merge_key_it, y_merge_key_it + d_y.size(), x_merge_val_it,
-      y_merge_val_it, thrust::make_discard_iterator(), merge_path.data(),
-      [=] __device__(auto const &l, auto const &r) -> bool {
-        auto l_column_id = thrust::get<0>(l);
-        auto r_column_id = thrust::get<0>(r);
-        if (l_column_id == r_column_id) {
-          return thrust::get<1>(l).value < thrust::get<1>(r).value;
-        }
-        return l_column_id < r_column_id;
-      });
-#endif

   // Compute output ptr
   auto transform_it =
       thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
-#if defined(XGBOOST_USE_CUDA)
   thrust::transform(
       thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
       out_ptr.data(),
       [] __device__(auto const& t) { return thrust::get<0>(t) + thrust::get<1>(t); });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::transform(
-      thrust::hip::par(alloc), transform_it, transform_it + x_ptr.size(),
-      out_ptr.data(),
-      [] __device__(auto const& t) { return thrust::get<0>(t) + thrust::get<1>(t); });
-#endif

   // 0^th is the indicator, 1^th is placeholder
   auto get_ind = []XGBOOST_DEVICE(Tuple const& t) { return thrust::get<0>(t); };
@@ -215,7 +199,6 @@
   // comparison, index of y is incremented by 1 from y_0 to y_1, and at the same time, y_0
   // is landed into output as the first element in merge result. The scan result is the
   // subscript of x and y.
-#if defined(XGBOOST_USE_CUDA)
   thrust::exclusive_scan_by_key(
       thrust::cuda::par(alloc), scan_key_it, scan_key_it + merge_path.size(),
       scan_val_it, merge_path.data(),
@@ -224,16 +207,6 @@
       thrust::equal_to{},
       [=] __device__(Tuple const &l, Tuple const &r) -> Tuple {
         return thrust::make_tuple(get_x(l) + get_x(r), get_y(l) + get_y(r));
       });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::exclusive_scan_by_key(
-      thrust::hip::par(alloc), scan_key_it, scan_key_it + merge_path.size(),
-      scan_val_it, merge_path.data(),
-      thrust::make_tuple(0ul, 0ul),
-      thrust::equal_to{},
-      [=] __device__(Tuple const &l, Tuple const &r) -> Tuple {
-        return thrust::make_tuple(get_x(l) + get_x(r), get_y(l) + get_y(r));
-      });
-#endif

   return merge_path;
 }
@@ -414,7 +387,6 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col
   // Reverse scan to accumulate weights into first duplicated element on left.
   auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
-#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan_by_key(
       thrust::cuda::par(alloc), key_it, key_it + entries.size(),
       val_it, val_it,
@@ -428,21 +400,6 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col
         }
         return l;
       });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan_by_key(
-      thrust::hip::par(alloc), key_it, key_it + entries.size(),
-      val_it, val_it,
-      thrust::equal_to{},
-      [] __device__(SketchEntry const &r, SketchEntry const &l) {
-        // Only accumulate for the first type of duplication.
-        if (l.value - r.value == 0 && l.rmin - r.rmin != 0) {
-          auto w = l.wmin + r.wmin;
-          SketchEntry v{l.rmin, l.rmin + w, w, l.value};
-          return v;
-        }
-        return l;
-      });
-#endif

   auto d_columns_ptr_out = columns_ptr_b_.DeviceSpan();
   // thrust unique_by_key preserves the first element.
@@ -691,7 +648,6 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
   // track of the unique keys (feature indices) after the thrust::reduce_by_key` call.
   dh::caching_device_vector d_max_keys(d_in_columns_ptr.size() - 1);
   dh::caching_device_vector d_max_values(d_in_columns_ptr.size() - 1);
-#if defined(XGBOOST_USE_CUDA)
   auto new_end = thrust::reduce_by_key(
       thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
       d_max_values.begin(), thrust::equal_to{},
@@ -705,21 +661,6 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
                                                            default_entry);
   thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
                   d_max_keys.begin(), d_max_results.begin());
-#elif defined(XGBOOST_USE_HIP)
-  auto new_end = thrust::reduce_by_key(
-      thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
-      d_max_values.begin(), thrust::equal_to{},
-      [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
-  d_max_keys.erase(new_end.first, d_max_keys.end());
-  d_max_values.erase(new_end.second, d_max_values.end());
-
-  // The device vector needs to be initialized explicitly since we may have some missing columns.
-  SketchEntry default_entry{};
-  dh::caching_device_vector d_max_results(d_in_columns_ptr.size() - 1,
-                                          default_entry);
-  thrust::scatter(thrust::hip::par(alloc), d_max_values.begin(), d_max_values.end(),
-                  d_max_keys.begin(), d_max_results.begin());
-#endif

   dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
   auto max_it = MakeIndexTransformIter([&](auto i) {
     if (IsCat(h_feature_types, i)) {
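MergePath above records, for every slot of the merged output, whether the element came from x or from y; quantile.cu tags the values with zipped indicators. A toy version that produces the same 0/1 origin path using constant iterators (hypothetical helper; both key ranges must already be sorted):

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/merge.h>

// origin must be presized to x.size() + y.size(); 0 means "came from x",
// 1 means "came from y". The merged keys themselves are discarded.
void MergeOrigin(thrust::device_vector<int> const &x, thrust::device_vector<int> const &y,
                 thrust::device_vector<int> *origin) {
  thrust::merge_by_key(x.begin(), x.end(), y.begin(), y.end(),
                       thrust::make_constant_iterator(0), thrust::make_constant_iterator(1),
                       thrust::make_discard_iterator(), origin->begin());
}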
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index fac254abf..63d7d1e5a 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -10,6 +10,12 @@
 #include "timer.h"
 #include "categorical.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace common {

@@ -184,19 +190,11 @@ class SketchContainer {
     d_column_scan = this->columns_ptr_.DeviceSpan();

-#if defined(XGBOOST_USE_CUDA)
     size_t n_uniques = dh::SegmentedUnique(
         thrust::cuda::par(alloc), d_column_scan.data(),
         d_column_scan.data() + d_column_scan.size(), entries.data(),
         entries.data() + entries.size(), scan_out.DevicePointer(),
         entries.data(), detail::SketchUnique{}, key_comp);
-#elif defined(XGBOOST_USE_HIP)
-    size_t n_uniques = dh::SegmentedUnique(
-        thrust::hip::par(alloc), d_column_scan.data(),
-        d_column_scan.data() + d_column_scan.size(), entries.data(),
-        entries.data() + entries.size(), scan_out.DevicePointer(),
-        entries.data(), detail::SketchUnique{}, key_comp);
-#endif

     this->columns_ptr_.Copy(scan_out);
     CHECK(!this->columns_ptr_.HostCanRead());
diff --git a/src/common/stats.cuh b/src/common/stats.cuh
index 0de654818..5c909a830 100644
--- a/src/common/stats.cuh
+++ b/src/common/stats.cuh
@@ -23,6 +23,12 @@
 #include "xgboost/context.h"  // Context
 #include "xgboost/span.h"     // Span

+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace common {
 namespace detail {
@@ -217,13 +223,8 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
   auto scan_val = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul),
                                             detail::WeightOp{w_begin, d_sorted_idx});
-#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                 scan_val, weights_cdf.begin());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan_by_key(thrust::hip::par(caching), scan_key, scan_key + n_weights,
-                                scan_val, weights_cdf.begin());
-#endif

   auto n_segments = std::distance(seg_beg, seg_end) - 1;
   quantiles->SetDevice(ctx->Device());
diff --git a/src/data/data.cu b/src/data/data.cu
index 9c0c02b24..39c44954c 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -17,6 +17,9 @@
 #if defined(XGBOOST_USE_HIP)
 namespace cub = hipcub;
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
 #endif

 namespace xgboost {
@@ -122,13 +125,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_
   group_ptr_.resize(h_num_runs_out + 1, 0);
   dh::XGBCachingDeviceAllocator alloc;
-#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan(thrust::cuda::par(alloc), cnt.begin(),
                          cnt.begin() + h_num_runs_out, cnt.begin());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan(thrust::hip::par(alloc), cnt.begin(),
-                         cnt.begin() + h_num_runs_out, cnt.begin());
-#endif

   thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out,
               group_ptr_.begin() + 1);
diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh
index ac19d47e4..b1c18ac6a 100644
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -17,6 +17,12 @@
 #include "adapter.h"
 #include "array_interface.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace data {

@@ -246,17 +252,10 @@ std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offs
   });

   dh::XGBCachingDeviceAllocator alloc;
-#if defined(XGBOOST_USE_CUDA)
   bst_row_t row_stride =
       dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
                  thrust::device_pointer_cast(offset.data()) + offset.size(),
                  static_cast(0), thrust::maximum());
-#elif defined(XGBOOST_USE_HIP)
-  bst_row_t row_stride =
-      dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
-                 thrust::device_pointer_cast(offset.data()) + offset.size(),
-                 static_cast(0), thrust::maximum());
-#endif
   return row_stride;
 }

@@ -280,13 +279,8 @@ bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
   // intervals to early stop. But we expect all data to be valid here, using small
   // intervals only decreases performance due to excessive kernel launch and stream
   // synchronization.
-#if defined(XGBOOST_USE_CUDA)
   auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true,
                           thrust::logical_and<>{});
-#elif defined(XGBOOST_USE_HIP)
-  auto valid = dh::Reduce(thrust::hip::par(alloc), value_iter, value_iter + batch.Size(), true,
-                          thrust::logical_and<>{});
-#endif
   return valid;
 }
 };  // namespace data
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 68a58fd60..cc09356c4 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -16,6 +16,12 @@
 #include "simple_batch_iterator.h"
 #include "sparse_page_source.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::data {
 void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
                                     DataIterHandle iter_handle, float missing,
@@ -86,11 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
       return GetRowCounts(value, row_counts_span, get_device(), missing);
     }));
-#if defined(XGBOOST_USE_CUDA)
     nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
-#elif defined(XGBOOST_USE_HIP)
-    nnz += thrust::reduce(thrust::hip::par(alloc), row_counts.begin(), row_counts.end());
-#endif
     batches++;
   } while (iter.Next());
diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh
index 01e532d01..a862ed23d 100644
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -13,6 +13,12 @@
 #include "../common/error_msg.h"  // for InfInData
 #include "device_adapter.cuh"     // for HasInfInData

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::data {

 #if defined(XGBOOST_USE_CUDA)
@@ -69,15 +75,9 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span offset,
   });

   dh::XGBCachingDeviceAllocator alloc;
-#if defined(XGBOOST_USE_CUDA)
   thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
                          thrust::device_pointer_cast(offset.data() + offset.size()),
                          thrust::device_pointer_cast(offset.data()));
-#elif defined(XGBOOST_USE_HIP)
-  thrust::exclusive_scan(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
-                         thrust::device_pointer_cast(offset.data() + offset.size()),
-                         thrust::device_pointer_cast(offset.data()));
-#endif
 }

 template
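GetRowCounts above ends with the row-stride computation: a max-reduce over the per-row element counts. A toy version of that reduction shape (hypothetical helper; XGBoost routes the real call through dh::Reduce with thrust::cuda::par(alloc)):

#include <cstddef>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

std::size_t MaxRowSize(thrust::device_vector<std::size_t> const &counts) {
  // The widest row determines the stride of the dense ELLPACK-style layout.
  return thrust::reduce(counts.begin(), counts.end(), std::size_t{0},
                        thrust::maximum<std::size_t>{});
}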
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index abbc4e944..d2194034e 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -25,6 +25,12 @@
 #include "xgboost/data.h"
 #include "xgboost/span.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace metric {
 // tag this file, used by force static link later.
@@ -127,17 +133,10 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
-#if defined(XGBOOST_USE_CUDA)
   auto end_unique = thrust::unique_by_key_copy(
       thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
       dh::tbegin(d_unique_idx));
-#elif defined(XGBOOST_USE_HIP)
-  auto end_unique = thrust::unique_by_key_copy(
-      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
-      dh::tbegin(d_unique_idx));
-#endif
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));

@@ -179,11 +178,7 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info,
   Pair last = cache->fptp.back();
-#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
-#elif defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
-#endif

   return std::make_tuple(last.first, last.second, auc);
 }
@@ -239,15 +234,9 @@ double ScaleClasses(common::Span results, common::Span local_are
   double tp_sum;
   double auc_sum;
-#if defined(XGBOOST_USE_CUDA)
   thrust::tie(auc_sum, tp_sum) =
       thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
                      Pair{0.0, 0.0}, PairPlus{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::tie(auc_sum, tp_sum) =
-      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
-                     Pair{0.0, 0.0}, PairPlus{});
-#endif

   if (tp_sum != 0 && !std::isnan(auc_sum)) {
     auc_sum /= tp_sum;
@@ -329,15 +318,9 @@ void SegmentedReduceAUC(common::Span d_unique_idx,
         return auc;
       });
-#if defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
-#elif defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
-                        key_in + d_unique_idx.size(), val_in,
-                        thrust::make_discard_iterator(), dh::tbegin(d_auc));
-#endif
 }

 /**
@@ -410,7 +393,6 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
   dh::TemporaryArray unique_class_ptr(d_class_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
-#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
       thrust::cuda::par(alloc),
       dh::tbegin(d_class_ptr),
@@ -421,18 +403,6 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to>{});
-#elif defined(XGBOOST_USE_HIP)
-  auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
-      dh::tbegin(d_class_ptr),
-      dh::tend(d_class_ptr),
-      uni_key,
-      uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx),
-      d_unique_class_ptr.data(),
-      dh::tbegin(d_unique_idx),
-      thrust::equal_to>{});
-#endif

   d_unique_idx = d_unique_idx.subspan(0, n_uniques);

@@ -553,15 +523,9 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
-#if defined(XGBOOST_USE_CUDA)
   size_t n_valid = thrust::count_if(
       thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
       [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
-#elif defined(XGBOOST_USE_HIP)
-  size_t n_valid = thrust::count_if(
-      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
-      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
-#endif

   if (n_valid < info.group_ptr_.size() - 1) {
     InvalidGroupAUC();
@@ -659,13 +623,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
   /**
    * Scale the AUC with number of items in each group.
    */
-#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
                               dh::tend(s_d_auc), 0.0);
-#elif defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
-                              dh::tend(s_d_auc), 0.0);
-#endif

   return std::make_pair(auc, n_valid);
 }
@@ -694,15 +653,9 @@ std::tuple GPUBinaryPRAUC(common::Span pred
   dh::XGBCachingDeviceAllocator alloc;
   double total_pos, total_neg;
-#if defined(XGBOOST_USE_CUDA)
   thrust::tie(total_pos, total_neg) =
       thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
                      Pair{0.0, 0.0}, PairPlus{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::tie(total_pos, total_neg) =
-      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
-                     Pair{0.0, 0.0}, PairPlus{});
-#endif

   if (total_pos <= 0.0 || total_neg <= 0.0) {
     return {0.0f, 0.0f, 0.0f};
@@ -755,17 +708,10 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts,
       });

   dh::XGBCachingDeviceAllocator alloc;
-#if defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to{}, PairPlus{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
-                        key_it + predts.size(), val_it,
-                        thrust::make_discard_iterator(), totals.begin(),
-                        thrust::equal_to{}, PairPlus{});
-#endif

   /**
    * Calculate AUC
@@ -834,7 +780,6 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info,
   dh::TemporaryArray unique_class_ptr(d_group_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
-#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
       thrust::cuda::par(alloc),
       dh::tbegin(d_group_ptr),
@@ -845,18 +790,6 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to>{});
-#elif defined(XGBOOST_USE_HIP)
-  auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
-      dh::tbegin(d_group_ptr),
-      dh::tend(d_group_ptr),
-      uni_key,
-      uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx),
-      d_unique_class_ptr.data(),
-      dh::tbegin(d_unique_idx),
-      thrust::equal_to>{});
-#endif

   d_unique_idx = d_unique_idx.subspan(0, n_uniques);

@@ -909,15 +842,9 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info,
         return thrust::make_pair(0.0, static_cast(1));
       });
-#if defined(XGBOOST_USE_CUDA)
    thrust::tie(auc, invalid_groups) = thrust::reduce(
        thrust::cuda::par(alloc), it, it + n_groups,
        thrust::pair(0.0, 0), PairPlus{});
-#elif defined(XGBOOST_USE_HIP)
-    thrust::tie(auc, invalid_groups) = thrust::reduce(
-        thrust::hip::par(alloc), it, it + n_groups,
-        thrust::pair(0.0, 0), PairPlus{});
-#endif
   }
   return std::make_pair(auc, n_groups - invalid_groups);
 }
@@ -949,17 +876,10 @@ std::pair GPURankingPRAUC(Context const *ctx,
   dh::XGBDeviceAllocator alloc;
   auto labels = info.labels.View(ctx->Device());
-#if defined(XGBOOST_USE_CUDA)
   if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
-#elif defined(XGBOOST_USE_HIP)
-  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
-                     dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
-    InvalidLabels();
-  }
-#endif
   /**
    * Get total positive/negative for each group.
    */
@@ -981,17 +901,10 @@ std::pair GPURankingPRAUC(Context const *ctx,
         return thrust::make_pair(y * w, (1.0 - y) * w);
       });
-#if defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to{}, PairPlus{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
-                        key_it + predts.size(), val_it,
-                        thrust::make_discard_iterator(), totals.begin(),
-                        thrust::equal_to{}, PairPlus{});
-#endif

   /**
    * Calculate AUC
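The binary-AUC path at the top of this file keeps one index per distinct prediction value before walking the ROC curve. A toy version of that unique_by_key_copy call shape (hypothetical helper; keys must already be sorted, as the predictions are after the argsort):

#include <cstddef>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/unique.h>

// out_idx must be presized to sorted_preds.size(); the return value is how
// many slots were written (one per distinct key, keeping the first position).
std::size_t UniquePositions(thrust::device_vector<float> const &sorted_preds,
                            thrust::device_vector<std::size_t> *out_idx) {
  auto pos = thrust::counting_iterator<std::size_t>(0);
  auto ends = thrust::unique_by_key_copy(sorted_preds.begin(), sorted_preds.end(), pos,
                                         thrust::make_discard_iterator(), out_idx->begin());
  return static_cast<std::size_t>(ends.second - out_idx->begin());
}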
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index f52b28fd1..eb766e964 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -30,6 +30,12 @@
 #include "../common/device_helpers.cuh"
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace metric {
 // tag this file, used by force static link later.
@@ -47,7 +53,7 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   PackedReduceResult result;
   auto labels = info.labels.View(ctx->Device());
   if (ctx->IsCUDA()) {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
     dh::XGBCachingDeviceAllocator alloc;
     thrust::counting_iterator begin(0);
     thrust::counting_iterator end = begin + labels.Size();
@@ -62,21 +68,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
           return PackedReduceResult{v, wt};
         },
         PackedReduceResult{}, thrust::plus());
-#elif defined(XGBOOST_USE_HIP)
-    dh::XGBCachingDeviceAllocator alloc;
-    thrust::counting_iterator begin(0);
-    thrust::counting_iterator end = begin + labels.Size();
-    result = thrust::transform_reduce(
-        thrust::hip::par(alloc), begin, end,
-        [=] XGBOOST_DEVICE(size_t i) {
-          auto idx = linalg::UnravelIndex(i, labels.Shape());
-          auto sample_id = std::get<0>(idx);
-          auto target_id = std::get<1>(idx);
-          auto res = loss(i, sample_id, target_id);
-          float v{std::get<0>(res)}, wt{std::get<1>(res)};
-          return PackedReduceResult{v, wt};
-        },
-        PackedReduceResult{}, thrust::plus());
 #else
     common::AssertGPUSupport();
 #endif  // defined(XGBOOST_USE_CUDA)
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index 6e9019488..e8f71dfd4 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -24,6 +24,12 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace metric {
 // tag this file, used by force static link later.
@@ -104,7 +110,6 @@ class MultiClassMetricsReduction {
     dh::XGBCachingDeviceAllocator alloc;

-#if defined(XGBOOST_USE_CUDA)
     PackedReduceResult result = thrust::transform_reduce(
         thrust::cuda::par(alloc),
         begin, end,
@@ -122,25 +127,6 @@ class MultiClassMetricsReduction {
         },
         PackedReduceResult(),
         thrust::plus());
-#elif defined(XGBOOST_USE_HIP)
-    PackedReduceResult result = thrust::transform_reduce(
-        thrust::hip::par(alloc),
-        begin, end,
-        [=] XGBOOST_DEVICE(size_t idx) {
-          bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
-          bst_float residue = 0;
-          auto label = static_cast(s_labels[idx]);
-          if (label >= 0 && label < static_cast(n_class)) {
-            residue = EvalRowPolicy::EvalRow(
-                label, &s_preds[idx * n_class], n_class) * weight;
-          } else {
-            s_label_error[0] = label;
-          }
-          return PackedReduceResult{ residue, weight };
-        },
-        PackedReduceResult(),
-        thrust::plus());
-#endif

     CheckLabelError(s_label_error[0], n_class);
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index b501bed76..19c1891e3 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -25,6 +25,12 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 using AFTParam = xgboost::common::AFTParam;
 using ProbabilityDistributionType = xgboost::common::ProbabilityDistributionType;
 template
@@ -103,7 +109,6 @@ class ElementWiseSurvivalMetricsReduction {
     dh::XGBCachingDeviceAllocator alloc;

-#if defined(XGBOOST_USE_CUDA)
     PackedReduceResult result = thrust::transform_reduce(
         thrust::cuda::par(alloc),
         begin, end,
@@ -118,22 +123,6 @@ class ElementWiseSurvivalMetricsReduction {
         },
         PackedReduceResult(),
         thrust::plus());
-#elif defined(XGBOOST_USE_HIP)
-    PackedReduceResult result = thrust::transform_reduce(
-        thrust::hip::par(alloc),
-        begin, end,
-        [=] XGBOOST_DEVICE(size_t idx) {
-          double weight = is_null_weight ? 1.0 : static_cast(s_weights[idx]);
-          double residue = d_policy.EvalRow(
-              static_cast(s_label_lower_bound[idx]),
-              static_cast(s_label_upper_bound[idx]),
-              static_cast(s_preds[idx]));
-          residue *= weight;
-          return PackedReduceResult{residue, weight};
-        },
-        PackedReduceResult(),
-        thrust::plus());
-#endif

     return result;
   }
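elementwise_metric.cu, multiclass_metric.cu, and survival_metric.cu all reduce with the same shape: map each element index to a (residual, weight) pair, then add pairs. A toy version using thrust::pair in place of XGBoost's PackedReduceResult (hypothetical helper; pred and label are assumed to be device pointers, and the real code passes thrust::cuda::par(alloc) where this sketch uses thrust::device):

#include <cstddef>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/pair.h>
#include <thrust/transform_reduce.h>

struct PairPlusOp {
  __host__ __device__ thrust::pair<double, double> operator()(
      thrust::pair<double, double> a, thrust::pair<double, double> b) const {
    return thrust::make_pair(a.first + b.first, a.second + b.second);
  }
};

struct SquaredErrorAt {
  double const *pred;   // device pointer
  double const *label;  // device pointer
  __host__ __device__ thrust::pair<double, double> operator()(std::size_t i) const {
    double err = pred[i] - label[i];
    return thrust::make_pair(err * err, 1.0);  // (residual, weight)
  }
};

thrust::pair<double, double> SumLoss(double const *pred, double const *label, std::size_t n) {
  return thrust::transform_reduce(thrust::device, thrust::counting_iterator<std::size_t>(0),
                                  thrust::counting_iterator<std::size_t>(n),
                                  SquaredErrorAt{pred, label},
                                  thrust::make_pair(0.0, 0.0), PairPlusOp{});
}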
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 4a75903b7..89506a86b 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -29,6 +29,12 @@
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::predictor {

 DMLC_REGISTRY_FILE_TAG(gpu_predictor);
@@ -512,7 +518,6 @@ void ExtractPaths(
     return PathInfo{static_cast(idx), path_length, tree_idx};
   });
-#if defined(XGBOOST_USE_CUDA)
   auto end = thrust::copy_if(
       thrust::cuda::par(alloc), nodes_transform,
       nodes_transform + d_nodes.size(), info.begin(),
@@ -525,20 +530,6 @@ void ExtractPaths(
   thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator,
                          length_iterator + info.size() + 1,
                          path_segments.begin());
-#elif defined(XGBOOST_USE_HIP)
-  auto end = thrust::copy_if(
-      thrust::hip::par(alloc), nodes_transform,
-      nodes_transform + d_nodes.size(), info.begin(),
-      [=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
-  info.resize(end - info.begin());
-  auto length_iterator = dh::MakeTransformIterator(
-      info.begin(),
-      [=] __device__(const PathInfo& info) { return info.length; });
-  dh::caching_device_vector path_segments(info.size() + 1);
-  thrust::exclusive_scan(thrust::hip::par(alloc), length_iterator,
-                         length_iterator + info.size() + 1,
-                         path_segments.begin());
-#endif

   paths->resize(path_segments.back());
diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu
index 8bbb62a29..2b0a248ce 100644
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -21,6 +21,12 @@
 #include "xgboost/logging.h"  // CHECK_EQ
 #include "xgboost/span.h"     // span

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::tree::cuda_impl {
 void FitStump(Context const* ctx, MetaInfo const& info,
               linalg::TensorView gpair, linalg::VectorView out) {
@@ -45,11 +51,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
   dh::XGBCachingDeviceAllocator alloc;

-#if defined(XGBOOST_USE_CUDA)
   auto policy = thrust::cuda::par(alloc);
-#elif defined(XGBOOST_USE_HIP)
-  auto policy = thrust::hip::par(alloc);
-#endif

   thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
                         thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index e4ca29c97..5d00640a4 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -12,6 +12,12 @@
 #include "evaluate_splits.cuh"
 #include "xgboost/data.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::tree {
 void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span ft,
                              bst_feature_t n_features, TrainParam const &param,
@@ -28,7 +34,6 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
 GPUHistEvaluator::SortHistogram(
   auto d_feature_idx = dh::ToSpan(feature_idx_);
   auto total_bins = shared_inputs.feature_values.size();
-#if defined(XGBOOST_USE_CUDA)
   thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data),
                     [=] XGBOOST_DEVICE(uint32_t i) {
                       auto const &input = d_inputs[i / total_bins];
@@ -115,27 +99,11 @@ common::Span GPUHistEvaluator::SortHistogram(
                       }
                       return thrust::make_tuple(i, 0.0f);
                     });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::transform(thrust::hip::par(alloc), it, it + data.size(), dh::tbegin(data),
-                    [=] XGBOOST_DEVICE(uint32_t i) {
-                      auto const &input = d_inputs[i / total_bins];
-                      auto j = i % total_bins;
-                      auto fidx = d_feature_idx[j];
-                      if (common::IsCat(shared_inputs.feature_types, fidx)) {
-                        auto grad =
-                            shared_inputs.rounding.ToFloatingPoint(input.gradient_histogram[j]);
-                        auto lw = evaluator.CalcWeightCat(shared_inputs.param, grad);
-                        return thrust::make_tuple(i, lw);
-                      }
-                      return thrust::make_tuple(i, 0.0f);
-                    });
-#endif

   // Sort an array segmented according to
   // - nodes
   // - features within each node
   // - gradients within each feature
-#if defined(XGBOOST_USE_CUDA)
   thrust::stable_sort_by_key(thrust::cuda::par(alloc), dh::tbegin(data), dh::tend(data),
                              dh::tbegin(sorted_idx),
                              [=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
@@ -166,38 +134,6 @@ common::Span GPUHistEvaluator::SortHistogram(
                                }
                                return li < ri;
                              });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::stable_sort_by_key(thrust::hip::par(alloc), dh::tbegin(data), dh::tend(data),
-                             dh::tbegin(sorted_idx),
-                             [=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
-                               auto li = thrust::get<0>(l);
-                               auto ri = thrust::get<0>(r);
-
-                               auto l_node = li / total_bins;
-                               auto r_node = ri / total_bins;
-
-                               if (l_node != r_node) {
-                                 return l_node < r_node;  // not the same node
-                               }
-
-                               li = li % total_bins;
-                               ri = ri % total_bins;
-
-                               auto lfidx = d_feature_idx[li];
-                               auto rfidx = d_feature_idx[ri];
-
-                               if (lfidx != rfidx) {
-                                 return lfidx < rfidx;  // not the same feature
-                               }
-
-                               if (common::IsCat(shared_inputs.feature_types, lfidx)) {
-                                 auto lw = thrust::get<1>(l);
-                                 auto rw = thrust::get<1>(r);
-                                 return lw < rw;
-                               }
-                               return li < ri;
-                             });
-#endif

   return dh::ToSpan(cat_sorted_idx_);
 }
 }  // namespace xgboost::tree
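FitStump above reduces gradients per target with reduce_by_key, synthesizing the keys on the fly from a transform iterator and discarding them on output. A toy version over equal-sized contiguous segments (hypothetical helper):

#include <cstddef>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

struct SegmentOf {
  std::size_t size;
  __host__ __device__ std::size_t operator()(std::size_t i) const { return i / size; }
};

// sums must hold one slot per segment; the keys are never materialized.
void SegmentSums(thrust::device_vector<double> const &vals, std::size_t seg_size,
                 thrust::device_vector<double> *sums) {
  auto key = thrust::make_transform_iterator(thrust::counting_iterator<std::size_t>(0),
                                             SegmentOf{seg_size});
  thrust::reduce_by_key(key, key + vals.size(), vals.begin(),
                        thrust::make_discard_iterator(), sums->begin());
}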
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index e52977065..64e665afc 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -16,6 +16,12 @@
 #include "row_partitioner.cuh"
 #include "xgboost/base.h"

+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+  namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace tree {
 namespace {
@@ -60,13 +66,8 @@ GradientQuantiser::GradientQuantiser(common::Span gpair, Met
   thrust::device_ptr gpair_beg{gpair.data()};
   auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
-#if defined(XGBOOST_USE_CUDA)
   Pair p =
       dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus{});
-#elif defined(XGBOOST_USE_HIP)
-  Pair p =
-      dh::Reduce(thrust::hip::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus{});
-#endif

   // Treat pair as array of 4 primitive types to allreduce
   using ReduceT = typename decltype(p.first)::ValueT;
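GradientQuantiser above reduces Clip()-transformed gradient pairs to bound the fixed-point scale used for histogram accumulation. A toy scalar version of that transform-then-reduce shape (hypothetical helper; the real code reduces a pair type through dh::Reduce with the caching allocator):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

struct AbsOp {
  __host__ __device__ float operator()(float g) const { return g < 0.f ? -g : g; }
};

float SumAbs(thrust::device_vector<float> const &grad) {
  // Clip()-style: view each element through a transform, then plus-reduce.
  auto it = thrust::make_transform_iterator(grad.begin(), AbsOp{});
  return thrust::reduce(it, it + grad.size(), 0.f, thrust::plus<float>{});
}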