/**
 * Copyright 2022-2023 by XGBoost Contributors
 */
#ifndef XGBOOST_COMMON_STATS_CUH_
#define XGBOOST_COMMON_STATS_CUH_

#include <thrust/binary_search.h>                  // lower_bound
#include <thrust/for_each.h>                       // for_each_n
#include <thrust/iterator/constant_iterator.h>     // make_constant_iterator
#include <thrust/iterator/counting_iterator.h>     // make_counting_iterator
#include <thrust/iterator/permutation_iterator.h>  // make_permutation_iterator
#include <thrust/scan.h>                           // inclusive_scan_by_key

#include <algorithm>    // std::min
#include <cstddef>      // std::size_t
#include <iterator>     // std::distance
#include <limits>       // std::numeric_limits
#include <type_traits>  // std::is_floating_point,std::iterator_traits

#include "algorithm.cuh"     // SegmentedArgMergeSort
#include "cuda_context.cuh"  // CUDAContext
#include "device_helpers.cuh"
#include "xgboost/context.h"  // Context
#include "xgboost/span.h"     // Span

namespace xgboost {
namespace common {
namespace detail {
// This should be a lambda function, but gcc-11 + nvcc-11.8 fails to compile it. As a
// result, a functor is extracted instead.
//
// error: ‘__T288’ was not declared in this scope
template <typename SegIt, typename ValIt, typename AlphaIt>
struct QuantileSegmentOp {
  SegIt seg_begin;
  ValIt val;
  AlphaIt alpha_it;
  Span<float> d_results;

  static_assert(std::is_floating_point<typename std::iterator_traits<ValIt>::value_type>::value,
                "Invalid value for quantile.");
  static_assert(std::is_floating_point<typename std::iterator_traits<AlphaIt>::value_type>::value,
                "Invalid alpha.");

  XGBOOST_DEVICE void operator()(std::size_t seg_idx) {
    std::size_t begin = seg_begin[seg_idx];
    auto n = static_cast<double>(seg_begin[seg_idx + 1] - begin);
    double a = alpha_it[seg_idx];

    // Empty segment, nothing to compute.
    if (n == 0) {
      d_results[seg_idx] = std::numeric_limits<float>::quiet_NaN();
      return;
    }

    // Clamp to the first/last value when alpha falls outside the interpolable range.
    if (a <= (1 / (n + 1))) {
      d_results[seg_idx] = val[begin];
      return;
    }
    if (a >= (n / (n + 1))) {
      d_results[seg_idx] = val[common::LastOf(seg_idx, seg_begin)];
      return;
    }

    // Linear interpolation between the two values bracketing the quantile.
    double x = a * static_cast<double>(n + 1);
    double k = std::floor(x) - 1;
    double d = (x - 1) - k;

    auto v0 = val[begin + static_cast<std::size_t>(k)];
    auto v1 = val[begin + static_cast<std::size_t>(k) + 1];

    d_results[seg_idx] = v0 + d * (v1 - v0);
  }
};
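// Note (illustrative, not part of the original source): the interpolation above follows
// the "exclusive" quantile definition (R's type 6, PERCENTILE.EXC), which places the
// alpha-quantile of n values at rank x = alpha * (n + 1). For a sorted segment
// {1, 2, 3, 4} with alpha = 0.5: x = 0.5 * 5 = 2.5, k = floor(2.5) - 1 = 1,
// d = (2.5 - 1) - 1 = 0.5, so v0 = 2, v1 = 3 and the result is 2 + 0.5 * (3 - 2) = 2.5.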
template <typename SegIt, typename ValIt, typename AlphaIt>
auto MakeQSegOp(SegIt seg_it, ValIt val_it, AlphaIt alpha_it, Span<float> d_results) {
  return QuantileSegmentOp<SegIt, ValIt, AlphaIt>{seg_it, val_it, alpha_it, d_results};
}

template <typename SegIt>
struct SegOp {
  SegIt seg_beg;
  SegIt seg_end;

  XGBOOST_DEVICE std::size_t operator()(std::size_t i) {
    return dh::SegmentId(seg_beg, seg_end, i);
  }
};

template <typename WIter>
struct WeightOp {
  WIter w_begin;
  Span<std::size_t const> d_sorted_idx;

  XGBOOST_DEVICE float operator()(std::size_t i) { return w_begin[d_sorted_idx[i]]; }
};

template <typename SegIt, typename ValIt, typename AlphaIt>
struct WeightedQuantileSegOp {
  AlphaIt alpha_it;
  SegIt seg_beg;
  ValIt val_begin;
  Span<float const> d_weight_cdf;
  Span<std::size_t const> d_sorted_idx;
  Span<float> d_results;

  static_assert(std::is_floating_point<typename std::iterator_traits<AlphaIt>::value_type>::value,
                "Invalid alpha.");
  static_assert(std::is_floating_point<typename std::iterator_traits<ValIt>::value_type>::value,
                "Invalid value for quantile.");

  XGBOOST_DEVICE void operator()(std::size_t seg_idx) {
    std::size_t begin = seg_beg[seg_idx];
    auto n = static_cast<double>(seg_beg[seg_idx + 1] - begin);
    if (n == 0) {
      d_results[seg_idx] = std::numeric_limits<float>::quiet_NaN();
      return;
    }
    auto seg_cdf = d_weight_cdf.subspan(begin, static_cast<std::size_t>(n));
    auto seg_sorted_idx = d_sorted_idx.subspan(begin, static_cast<std::size_t>(n));
    double a = alpha_it[seg_idx];

    // Pick the first value whose cumulative weight reaches alpha * total_weight.
    double thresh = seg_cdf.back() * a;
    std::size_t idx =
        thrust::lower_bound(thrust::seq, seg_cdf.data(), seg_cdf.data() + seg_cdf.size(), thresh) -
        seg_cdf.data();
    idx = std::min(idx, static_cast<std::size_t>(n - 1));
    d_results[seg_idx] = val_begin[seg_sorted_idx[idx]];
  }
};

template <typename SegIt, typename ValIt, typename AlphaIt>
auto MakeWQSegOp(SegIt seg_it, ValIt val_it, AlphaIt alpha_it, Span<float const> d_weight_cdf,
                 Span<std::size_t const> d_sorted_idx, Span<float> d_results) {
  return WeightedQuantileSegOp<SegIt, ValIt, AlphaIt>{alpha_it,     seg_it,       val_it,
                                                      d_weight_cdf, d_sorted_idx, d_results};
}
}  // namespace detail

/**
 * @brief Compute segmented quantile on GPU.
 *
 * @tparam SegIt   Iterator for CSR style segments indptr
 * @tparam ValIt   Iterator for values
 * @tparam AlphaIt Iterator to alphas
 *
 * @param alpha_it The p^th quantile we want to compute, one for each segment.
 *
 * std::distance(seg_begin, seg_end) should be equal to n_segments + 1
 */
template <typename SegIt, typename ValIt, typename AlphaIt,
          typename std::enable_if_t<!std::is_same<AlphaIt, double>::value>* = nullptr>
void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, SegIt seg_end,
                       ValIt val_begin, ValIt val_end, HostDeviceVector<float>* quantiles) {
  dh::device_vector<std::size_t> sorted_idx;
  using Tup = thrust::tuple<std::size_t, float>;

  common::SegmentedArgMergeSort(ctx, seg_begin, seg_end, val_begin, val_end, &sorted_idx);
  auto n_segments = std::distance(seg_begin, seg_end) - 1;
  if (n_segments <= 0) {
    return;
  }

  auto d_sorted_idx = dh::ToSpan(sorted_idx);
  auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));

  quantiles->SetDevice(ctx->gpu_id);
  quantiles->Resize(n_segments);
  auto d_results = quantiles->DeviceSpan();

  dh::LaunchN(n_segments, ctx->CUDACtx()->Stream(),
              detail::MakeQSegOp(seg_begin, val, alpha_it, d_results));
}

/**
 * @brief Compute segmented quantile on GPU.
 *
 * @tparam SegIt Iterator for CSR style segments indptr
 * @tparam ValIt Iterator for values
 *
 * @param alpha The p^th quantile we want to compute
 *
 * std::distance(seg_begin, seg_end) should be equal to n_segments + 1
 */
template <typename SegIt, typename ValIt>
void SegmentedQuantile(Context const* ctx, double alpha, SegIt seg_begin, SegIt seg_end,
                       ValIt val_begin, ValIt val_end, HostDeviceVector<float>* quantiles) {
  CHECK(alpha >= 0 && alpha <= 1);
  auto alpha_it = thrust::make_constant_iterator(alpha);
  return SegmentedQuantile(ctx, alpha_it, seg_begin, seg_end, val_begin, val_end, quantiles);
}
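// A minimal usage sketch (not part of the original header; `ctx` and the buffers are
// hypothetical). It computes the per-segment median of two CSR-style segments [0, 3)
// and [3, 6); values need not be pre-sorted, since SegmentedQuantile argsorts them
// internally:
//
//   std::vector<std::size_t> h_ptr{0, 3, 6};  // n_segments + 1 entries
//   std::vector<float> h_val{3.f, 1.f, 2.f, 6.f, 4.f, 5.f};
//   dh::device_vector<std::size_t> d_ptr(h_ptr);
//   dh::device_vector<float> d_val(h_val);
//   HostDeviceVector<float> quantiles;
//   SegmentedQuantile(ctx, 0.5, d_ptr.data().get(), d_ptr.data().get() + d_ptr.size(),
//                     d_val.data().get(), d_val.data().get() + d_val.size(), &quantiles);
//   // quantiles becomes {2.0f, 5.0f}.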
/**
 * @brief Compute segmented quantile on GPU with weighted inputs.
 *
 * @tparam SegIt Iterator for CSR style segments indptr
 * @tparam ValIt Iterator for values
 * @tparam WIter Iterator for weights
 *
 * @param alpha_it Iterator for the p^th quantile we want to compute, one per segment
 * @param w_begin  Iterator for weight for each input element
 */
template <typename SegIt, typename ValIt, typename AlphaIt, typename WIter,
          typename std::enable_if_t<
              !std::is_same<typename std::iterator_traits<AlphaIt>::value_type, void>::value>* =
              nullptr>
void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_beg, SegIt seg_end,
                               ValIt val_begin, ValIt val_end, WIter w_begin, WIter w_end,
                               HostDeviceVector<float>* quantiles) {
  auto cuctx = ctx->CUDACtx();
  dh::device_vector<std::size_t> sorted_idx;
  common::SegmentedArgMergeSort(ctx, seg_beg, seg_end, val_begin, val_end, &sorted_idx);
  auto d_sorted_idx = dh::ToSpan(sorted_idx);

  std::size_t n_weights = std::distance(w_begin, w_end);
  dh::device_vector<float> weights_cdf(n_weights);

  std::size_t n_elems = std::distance(val_begin, val_end);
  CHECK_EQ(n_weights, n_elems);

  dh::XGBCachingDeviceAllocator<char> caching;
  auto scan_key = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
                                                         detail::SegOp<SegIt>{seg_beg, seg_end});
  auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                   detail::WeightOp<WIter>{w_begin, d_sorted_idx});
  // Build the per-segment cumulative weight distribution over the sorted values.
  thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                scan_val, weights_cdf.begin());

  auto n_segments = std::distance(seg_beg, seg_end) - 1;
  quantiles->SetDevice(ctx->gpu_id);
  quantiles->Resize(n_segments);
  auto d_results = quantiles->DeviceSpan();
  auto d_weight_cdf = dh::ToSpan(weights_cdf);

  thrust::for_each_n(
      cuctx->CTP(), thrust::make_counting_iterator(0ul), n_segments,
      detail::MakeWQSegOp(seg_beg, val_begin, alpha_it, d_weight_cdf, d_sorted_idx, d_results));
}

template <typename SegIt, typename ValIt, typename WIter>
void SegmentedWeightedQuantile(Context const* ctx, double alpha, SegIt seg_beg, SegIt seg_end,
                               ValIt val_begin, ValIt val_end, WIter w_begin, WIter w_end,
                               HostDeviceVector<float>* quantiles) {
  CHECK(alpha >= 0 && alpha <= 1);
  return SegmentedWeightedQuantile(ctx, thrust::make_constant_iterator(alpha), seg_beg, seg_end,
                                   val_begin, val_end, w_begin, w_end, quantiles);
}
}  // namespace common
}  // namespace xgboost
#endif  // XGBOOST_COMMON_STATS_CUH_