/**
 * Copyright 2015-2023 by XGBoost contributors
 *
 * \brief CUDA implementation of lambdarank.
 */
#include <thrust/fill.h>                        // for fill_n
#include <thrust/for_each.h>                    // for for_each_n
#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
#include <thrust/iterator/zip_iterator.h>       // for make_zip_iterator
#include <thrust/tuple.h>                       // for make_tuple, tuple, tie, get

#include <algorithm>  // for min
#include <cassert>    // for assert
#include <cmath>      // for abs, log2, isinf
#include <cstddef>    // for size_t
#include <cstdint>    // for int32_t
#include <memory>     // for shared_ptr
#include <utility>

#include "../common/algorithm.cuh"       // for SegmentedArgSort
#include "../common/cuda_context.cuh"    // for CUDAContext
#include "../common/deterministic.cuh"   // for CreateRoundingFactor, TruncateWithRounding
#include "../common/device_helpers.cuh"  // for SegmentId, TemporaryArray, AtomicAddGpair
#include "../common/optional_weight.h"   // for MakeOptionalWeights
#include "../common/ranking_utils.h"     // for NDCGCache, LambdaRankParam, rel_degree_t
#include "lambdarank_obj.cuh"
#include "lambdarank_obj.h"
#include "xgboost/base.h"                // for bst_group_t, XGBOOST_DEVICE, GradientPair
#include "xgboost/context.h"             // for Context
#include "xgboost/data.h"                // for MetaInfo
#include "xgboost/host_device_vector.h"  // for HostDeviceVector
#include "xgboost/linalg.h"              // for VectorView, Range, Vector
#include "xgboost/logging.h"
#include "xgboost/span.h"                // for Span

namespace xgboost::obj {
DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu);

namespace cuda_impl {
namespace {
/**
 * \brief Calculate the minimum value of the bias for floating point truncation.
 */
void MinBias(Context const* ctx, std::shared_ptr<ltr::RankingCache> p_cache,
             linalg::VectorView<double const> t_plus, linalg::VectorView<double const> tj_minus,
             common::Span<double> d_min) {
  CHECK_EQ(d_min.size(), 2);
  auto cuctx = ctx->CUDACtx();

  auto k = t_plus.Size();
  auto const& p = p_cache->Param();
  CHECK_GT(k, 0);
  CHECK_EQ(k, p_cache->MaxPositionSize());
  // Two segments: [0, k) for |t+| and [k, 2k) for |t-|.
  auto key_it = dh::MakeTransformIterator<std::size_t>(
      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return i * k; });
  auto val_it = dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul),
                                                  [=] XGBOOST_DEVICE(std::size_t i) {
                                                    if (i >= k) {
                                                      return std::abs(tj_minus(i - k));
                                                    }
                                                    return std::abs(t_plus(i));
                                                  });

  std::size_t bytes;
  cub::DeviceSegmentedReduce::Min(nullptr, bytes, val_it, d_min.data(), 2, key_it, key_it + 1,
                                  cuctx->Stream());
  dh::TemporaryArray<char> temp(bytes);
  cub::DeviceSegmentedReduce::Min(temp.data().get(), bytes, val_it, d_min.data(), 2, key_it,
                                  key_it + 1, cuctx->Stream());
}
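// Note on the call pattern above: CUB device-wide algorithms are invoked twice by
// design. The first call with a nullptr workspace only computes the required temporary
// storage size in `bytes`; the second call performs the actual segmented reduction. The
// same pattern appears in CalcGrad and LambdaRankUpdatePositionBias below.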
/**
 * \brief Type for gradient statistic. (Gradient, cost for unbiased LTR, normalization factor)
 */
using GradCostNorm = thrust::tuple<GradientPair, double, double>;

/**
 * \brief Obtain and update the gradient for one pair.
 */
template <bool unbiased, bool has_truncation, typename Delta>
struct GetGradOp {
  MakePairsOp<has_truncation> make_pair;
  Delta delta;

  bool need_update;

  auto __device__ operator()(std::size_t idx) -> GradCostNorm {
    auto const& args = make_pair.args;
    auto g = dh::SegmentId(args.d_threads_group_ptr, idx);

    auto data_group_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
    std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin;

    // obtain group segment data.
    auto g_label =
        args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0);
    auto g_predt = args.predts.subspan(data_group_begin, n_data);
    auto g_gpair = args.gpairs.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
    auto g_rank = args.d_sorted_idx.subspan(data_group_begin, n_data);

    auto [i, j] = make_pair(idx, g);

    std::size_t rank_high = i, rank_low = j;
    if (g_label(g_rank[i]) == g_label(g_rank[j])) {
      // pairs with equal labels carry no gradient
      return thrust::make_tuple(GradientPair{}, 0.0, 0.0);
    }
    if (g_label(g_rank[i]) < g_label(g_rank[j])) {
      thrust::swap(rank_high, rank_low);
    }

    double cost{0};
    auto delta_op = [&](auto const&... args) { return delta(args..., g); };
    GradientPair pg =
        LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op, args.ti_plus,
                             args.tj_minus, &cost);

    std::size_t idx_high = g_rank[rank_high];
    std::size_t idx_low = g_rank[rank_low];

    if (need_update) {
      // second run, update the gradient
      auto ng = Repulse(pg);

      auto gr = args.d_roundings(g);
      // positive gradient truncated
      auto pgt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), pg.GetGrad()),
                              common::TruncateWithRounding(gr.GetHess(), pg.GetHess())};
      // negative gradient truncated
      auto ngt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), ng.GetGrad()),
                              common::TruncateWithRounding(gr.GetHess(), ng.GetHess())};

      dh::AtomicAddGpair(&g_gpair(idx_high), pgt);
      dh::AtomicAddGpair(&g_gpair(idx_low), ngt);
    }

    if (unbiased && need_update) {
      // second run, update the cost
      assert(args.tj_minus.Size() == args.ti_plus.Size() && "Invalid size of position bias");

      auto g_li = args.li.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
      auto g_lj = args.lj.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));

      if (idx_high < args.ti_plus.Size() && idx_low < args.ti_plus.Size()) {
        if (args.tj_minus(idx_low) >= Eps64()) {
          // eq.30
          atomicAdd(&g_li(idx_high), common::TruncateWithRounding(args.d_cost_rounding[0],
                                                                  cost / args.tj_minus(idx_low)));
        }
        if (args.ti_plus(idx_high) >= Eps64()) {
          // eq.31
          atomicAdd(&g_lj(idx_low), common::TruncateWithRounding(args.d_cost_rounding[0],
                                                                 cost / args.ti_plus(idx_high)));
        }
      }
    }
    return thrust::make_tuple(GradientPair{std::abs(pg.GetGrad()), std::abs(pg.GetHess())},
                              std::abs(cost), -2.0 * static_cast<double>(pg.GetGrad()));
  }
};

template <bool unbiased, bool has_truncation, typename Delta>
struct MakeGetGrad {
  MakePairsOp<has_truncation> make_pair;
  Delta delta;

  [[nodiscard]] KernelInputs const& Args() const { return make_pair.args; }

  MakeGetGrad(KernelInputs args, Delta d) : make_pair{args}, delta{std::move(d)} {}

  GetGradOp<unbiased, has_truncation, Delta> operator()(bool need_update) {
    return GetGradOp<unbiased, has_truncation, Delta>{make_pair, delta, need_update};
  }
};
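// A host-side sketch of how the op above is driven (illustrative only; the real
// dispatch is the cub segmented reduction and thrust::for_each_n in CalcGrad below):
//
//   for (std::size_t idx = 0; idx < p_cache->CUDAThreads(); ++idx) {
//     auto op = make_get_grad(/*need_update=*/true);
//     op(idx);  // one thread <=> one candidate pair within its group
//   }
//
// MakePairsOp (defined in lambdarank_obj.cuh) maps the flat thread index to the
// in-group ranks (i, j) of the pair, using dh::SegmentId to locate the group.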
/**
 * \brief Calculate gradients for all pairs using the update op created by make_get_grad.
 *
 * We need to run the gradient calculation twice: the first run gathers information like
 * the maximum gradient, the maximum cost, and the normalization term using a reduction.
 * The second run performs the actual update.
 *
 * Without normalization, we would only need to run it once, since we can manually
 * calculate the bounds of the gradient (NDCG \in [0, 1], delta_NDCG \in [0, 1], and
 * ti+/tj- are from the previous iteration, so the bound can be calculated for the
 * current iteration). However, if normalization is used, the delta score is unbounded
 * and we need to obtain the sum of gradients. As a tradeoff, we simply run the kernel
 * twice, once as a reduction and a second time as a for_each.
 *
 * Alternatively, we could bound the delta score by limiting the output of the model
 * using a sigmoid for binary output and some normalization for multi-level output. But
 * the effect on accuracy is not known yet, and this code path is only used by the GPU.
 *
 * For performance, the segmented sort of the scores is the bottleneck and takes up
 * about half of the time, while the reduction and the for_each take up the second half.
 */
template <bool unbiased, bool has_truncation, typename Delta>
void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::RankingCache> p_cache,
              MakeGetGrad<unbiased, has_truncation, Delta> make_get_grad) {
  auto n_groups = p_cache->Groups();
  auto d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr();
  auto d_gptr = p_cache->DataGroupPtr(ctx);
  auto d_gpair = make_get_grad.Args().gpairs;

  /**
   * First pass, gather info for normalization and the rounding factor.
   */
  auto val_it = dh::MakeTransformIterator<GradCostNorm>(thrust::make_counting_iterator(0ul),
                                                        make_get_grad(false));
  auto reduction_op = [] XGBOOST_DEVICE(GradCostNorm const& l,
                                        GradCostNorm const& r) -> GradCostNorm {
    // get the maximum gradient for each group, along with the cost and the normalization term
    auto const& lg = thrust::get<0>(l);
    auto const& rg = thrust::get<0>(r);
    auto grad = std::max(lg.GetGrad(), rg.GetGrad());
    auto hess = std::max(lg.GetHess(), rg.GetHess());
    auto cost = std::max(thrust::get<1>(l), thrust::get<1>(r));
    double sum_lambda = thrust::get<2>(l) + thrust::get<2>(r);
    return thrust::make_tuple(GradientPair{std::abs(grad), std::abs(hess)}, cost, sum_lambda);
  };
  auto init = thrust::make_tuple(GradientPair{0.0f, 0.0f}, 0.0, 0.0);
  common::Span<GradCostNorm> d_max_lambdas = p_cache->MaxLambdas<GradCostNorm>(ctx, n_groups);
  CHECK_EQ(n_groups * sizeof(GradCostNorm), d_max_lambdas.size_bytes());
  std::size_t bytes;
  cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, d_max_lambdas.data(), n_groups,
                                     d_threads_group_ptr.data(), d_threads_group_ptr.data() + 1,
                                     reduction_op, init, ctx->CUDACtx()->Stream());
  dh::TemporaryArray<char> temp(bytes);
  cub::DeviceSegmentedReduce::Reduce(temp.data().get(), bytes, val_it, d_max_lambdas.data(),
                                     n_groups, d_threads_group_ptr.data(),
                                     d_threads_group_ptr.data() + 1, reduction_op, init,
                                     ctx->CUDACtx()->Stream());

  dh::TemporaryArray<double> min_bias(2);
  auto d_min_bias = dh::ToSpan(min_bias);
  if (unbiased) {
    MinBias(ctx, p_cache, make_get_grad.Args().ti_plus, make_get_grad.Args().tj_minus, d_min_bias);
  }
  /**
   * Create rounding factors
   */
  auto d_cost_rounding = p_cache->CUDACostRounding(ctx);
  auto d_rounding = p_cache->CUDARounding(ctx);
  dh::LaunchN(n_groups, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t g) mutable {
    auto group_size = d_gptr[g + 1] - d_gptr[g];
    auto const& max_grad = thrust::get<0>(d_max_lambdas[g]);
    // float group size
    auto fgs = static_cast<float>(group_size);
    auto grad = common::CreateRoundingFactor(fgs * max_grad.GetGrad(), group_size);
    auto hess = common::CreateRoundingFactor(fgs * max_grad.GetHess(), group_size);
    d_rounding(g) = GradientPair{grad, hess};

    auto cost = thrust::get<1>(d_max_lambdas[g]);
    if (unbiased) {
      cost /= std::min(d_min_bias[0], d_min_bias[1]);
      d_cost_rounding[0] = common::CreateRoundingFactor(fgs * cost, group_size);
    }
  });

  /**
   * Second pass, actual update to gradient and bias.
   */
  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul),
                     p_cache->CUDAThreads(), make_get_grad(true));

  /**
   * Lastly, normalization and weight.
   */
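  // Normalization sketch: with s being the group's accumulated normalization term (the
  // sum of -2 * lambda over all pairs, i.e. the third element of GradCostNorm), each
  // gradient is scaled by norm = log2(1 + s) / s. The factor approaches 1/ln(2) as
  // s -> 0 and decays like log2(s)/s for large s; e.g. s = 1 gives norm = 1, while
  // s = 1023 gives norm = 10/1023 ~ 0.01, damping groups with huge lambda sums.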
  auto d_weights = common::MakeOptionalWeights(ctx, info.weights_);
  auto w_norm = p_cache->WeightNorm();
  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(),
                     [=] XGBOOST_DEVICE(std::size_t i) mutable {
                       auto g = dh::SegmentId(d_gptr, i);
                       auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
                       // Normalization
                       if (sum_lambda > 0.0) {
                         double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
                         d_gpair(i, 0) *= norm;
                       }
                       d_gpair(i, 0) *= (d_weights[g] * w_norm);
                     });
}

/**
 * \brief Handles boilerplate code like getting the device span.
 */
template <typename Delta>
void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const& preds,
            const MetaInfo& info, std::shared_ptr<ltr::RankingCache> p_cache, Delta delta,
            linalg::VectorView<double const> ti_plus,   // input bias ratio
            linalg::VectorView<double const> tj_minus,  // input bias ratio
            linalg::VectorView<double> li, linalg::VectorView<double> lj,
            linalg::Matrix<GradientPair>* out_gpair) {
  // boilerplate
  std::int32_t device_id = ctx->gpu_id;
  dh::safe_cuda(cudaSetDevice(device_id));
  auto n_groups = p_cache->Groups();

  info.labels.SetDevice(device_id);
  preds.SetDevice(device_id);
  out_gpair->SetDevice(ctx->Device());
  out_gpair->Reshape(preds.Size(), 1);

  CHECK(p_cache);

  auto d_rounding = p_cache->CUDARounding(ctx);
  auto d_cost_rounding = p_cache->CUDACostRounding(ctx);

  CHECK_NE(d_rounding.Size(), 0);

  auto label = info.labels.View(ctx->gpu_id);
  auto predts = preds.ConstDeviceSpan();
  auto gpairs = out_gpair->View(ctx->Device());
  thrust::fill_n(ctx->CUDACtx()->CTP(), gpairs.Values().data(), gpairs.Size(),
                 GradientPair{0.0f, 0.0f});

  auto const d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr();
  auto const d_gptr = p_cache->DataGroupPtr(ctx);
  auto const rank_idx = p_cache->SortedIdx(ctx, predts);

  auto const unbiased = p_cache->Param().lambdarank_unbiased;

  common::Span<std::size_t const> d_y_sorted_idx;
  if (!p_cache->Param().HasTruncation()) {
    d_y_sorted_idx = SortY(ctx, info, rank_idx, p_cache);
  }

  KernelInputs args{ti_plus,  tj_minus,   li,
                    lj,       d_gptr,     d_threads_group_ptr,
                    rank_idx, label,      predts,
                    gpairs,   d_rounding, d_cost_rounding.data(),
                    d_y_sorted_idx,       iter};

  // dispatch based on unbiased and truncation
  if (p_cache->Param().HasTruncation()) {
    if (unbiased) {
      CalcGrad(ctx, info, p_cache, MakeGetGrad<true, true, Delta>{args, delta});
    } else {
      CalcGrad(ctx, info, p_cache, MakeGetGrad<false, true, Delta>{args, delta});
    }
  } else {
    if (unbiased) {
      CalcGrad(ctx, info, p_cache, MakeGetGrad<true, false, Delta>{args, delta});
    } else {
      CalcGrad(ctx, info, p_cache, MakeGetGrad<false, false, Delta>{args, delta});
    }
  }
}
}  // anonymous namespace
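// SortY argsorts the labels within each group *after* they have been permuted by the
// current prediction ranking, so (assuming the descending sort used here)
// d_y_sorted_idx[k] answers: among items ordered by predicted score, which position
// holds the k-th most relevant label? It is only required when no truncation level is
// configured, where MakePairsOp samples pairs from this label-sorted order.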
common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
                                      common::Span<std::size_t const> d_rank,
                                      std::shared_ptr<ltr::RankingCache> p_cache) {
  auto const d_group_ptr = p_cache->DataGroupPtr(ctx);
  auto label = info.labels.View(ctx->gpu_id);
  // The buffer for ranked y is necessary as cub segmented sort accepts only pointers.
  auto d_y_ranked = p_cache->RankedY(ctx, info.num_row_);
  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul),
                     d_y_ranked.size(), [=] XGBOOST_DEVICE(std::size_t i) {
                       auto g = dh::SegmentId(d_group_ptr, i);
                       auto g_label =
                           label.Slice(linalg::Range(d_group_ptr[g], d_group_ptr[g + 1]), 0);
                       auto g_rank_idx = d_rank.subspan(d_group_ptr[g], g_label.Size());
                       i -= d_group_ptr[g];
                       auto g_y_ranked = d_y_ranked.subspan(d_group_ptr[g], g_label.Size());
                       g_y_ranked[i] = g_label(g_rank_idx[i]);
                     });

  auto d_y_sorted_idx = p_cache->SortedIdxY(ctx, info.num_row_);
  common::SegmentedArgSort<false, true>(ctx, d_y_ranked, d_group_ptr, d_y_sorted_idx);
  return d_y_sorted_idx;
}

void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                               const HostDeviceVector<float>& preds, const MetaInfo& info,
                               std::shared_ptr<ltr::NDCGCache> p_cache,
                               linalg::VectorView<double const> ti_plus,   // input bias ratio
                               linalg::VectorView<double const> tj_minus,  // input bias ratio
                               linalg::VectorView<double> li, linalg::VectorView<double> lj,
                               linalg::Matrix<GradientPair>* out_gpair) {
  // boilerplate
  std::int32_t device_id = ctx->gpu_id;
  dh::safe_cuda(cudaSetDevice(device_id));
  auto const d_inv_IDCG = p_cache->InvIDCG(ctx);
  auto const discount = p_cache->Discount(ctx);

  info.labels.SetDevice(device_id);
  preds.SetDevice(device_id);

  auto const exp_gain = p_cache->Param().ndcg_exp_gain;
  auto delta_ndcg = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high,
                                       std::size_t rank_low, bst_group_t g) {
    return exp_gain ? DeltaNDCG<true>(y_high, y_low, rank_high, rank_low, d_inv_IDCG(g), discount)
                    : DeltaNDCG<false>(y_high, y_low, rank_high, rank_low, d_inv_IDCG(g), discount);
  };
  Launch(ctx, iter, preds, info, p_cache, delta_ndcg, ti_plus, tj_minus, li, lj, out_gpair);
}

void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
             std::shared_ptr<ltr::MAPCache> p_cache) {
  common::Span<double> out_n_rel = p_cache->NumRelevant(ctx);
  common::Span<double> out_acc = p_cache->Acc(ctx);

  CHECK_EQ(out_n_rel.size(), info.num_row_);
  CHECK_EQ(out_acc.size(), info.num_row_);

  auto group_ptr = p_cache->DataGroupPtr(ctx);
  auto key_it = dh::MakeTransformIterator<std::size_t>(
      thrust::make_counting_iterator(0ul),
      [=] XGBOOST_DEVICE(std::size_t i) -> std::size_t { return dh::SegmentId(group_ptr, i); });
  auto label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
  auto const* cuctx = ctx->CUDACtx();

  {
    // calculate the number of relevant documents at each position
    auto val_it = dh::MakeTransformIterator<double>(
        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> double {
          auto g = dh::SegmentId(group_ptr, i);
          auto g_label = label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
          auto idx_in_group = i - group_ptr[g];
          auto g_sorted_idx = d_rank_idx.subspan(group_ptr[g], group_ptr[g + 1] - group_ptr[g]);
          return static_cast<double>(g_label(g_sorted_idx[idx_in_group]));
        });
    thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + info.num_row_, val_it,
                                  out_n_rel.data());
  }
  {
    // \sum l_k/k
    auto val_it = dh::MakeTransformIterator<double>(
        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> double {
          auto g = dh::SegmentId(group_ptr, i);
          auto g_label = label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
          auto g_sorted_idx = d_rank_idx.subspan(group_ptr[g], group_ptr[g + 1] - group_ptr[g]);
          auto idx_in_group = i - group_ptr[g];
          double rank_in_group = idx_in_group + 1.0;
          return static_cast<double>(g_label(g_sorted_idx[idx_in_group])) / rank_in_group;
        });
    thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + info.num_row_, val_it,
                                  out_acc.data());
  }
}
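// Worked example of the two scans above, for a single group with labels (ordered by
// the current ranking) y = {1, 0, 1}:
//
//   n_rel = {1, 1, 2}                // inclusive prefix sum of relevance
//   acc   = {1/1, 1/1, 1/1 + 1/3}    // inclusive prefix sum of l_k / k
//
// DeltaMAP consumes these prefix statistics to compute the change in AP from swapping
// a pair without rescanning the whole group.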
void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
                              HostDeviceVector<float> const& predt, const MetaInfo& info,
                              std::shared_ptr<ltr::MAPCache> p_cache,
                              linalg::VectorView<double const> ti_plus,   // input bias ratio
                              linalg::VectorView<double const> tj_minus,  // input bias ratio
                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
                              linalg::Matrix<GradientPair>* out_gpair) {
  std::int32_t device_id = ctx->gpu_id;
  dh::safe_cuda(cudaSetDevice(device_id));

  info.labels.SetDevice(device_id);
  predt.SetDevice(device_id);

  CHECK(p_cache);

  auto d_predt = predt.ConstDeviceSpan();
  auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);

  MAPStat(ctx, info, d_sorted_idx, p_cache);
  auto d_n_rel = p_cache->NumRelevant(ctx);
  auto d_acc = p_cache->Acc(ctx);

  auto d_gptr = p_cache->DataGroupPtr(ctx).data();
  auto delta_map = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high,
                                      std::size_t rank_low, bst_group_t g) {
    if (rank_high > rank_low) {
      thrust::swap(rank_high, rank_low);
      thrust::swap(y_high, y_low);
    }
    auto cnt = d_gptr[g + 1] - d_gptr[g];
    auto g_n_rel = d_n_rel.subspan(d_gptr[g], cnt);
    auto g_acc = d_acc.subspan(d_gptr[g], cnt);
    auto d = DeltaMAP(y_high, y_low, rank_high, rank_low, g_n_rel, g_acc);
    return d;
  };

  Launch(ctx, iter, predt, info, p_cache, delta_map, ti_plus, tj_minus, li, lj, out_gpair);
}

void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
                                   HostDeviceVector<float> const& predt, const MetaInfo& info,
                                   std::shared_ptr<ltr::RankingCache> p_cache,
                                   linalg::VectorView<double const> ti_plus,   // input bias ratio
                                   linalg::VectorView<double const> tj_minus,  // input bias ratio
                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
                                   linalg::Matrix<GradientPair>* out_gpair) {
  std::int32_t device_id = ctx->gpu_id;
  dh::safe_cuda(cudaSetDevice(device_id));

  info.labels.SetDevice(device_id);
  predt.SetDevice(device_id);

  auto d_predt = predt.ConstDeviceSpan();
  auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);

  auto delta = [] XGBOOST_DEVICE(float, float, std::size_t, std::size_t, bst_group_t) {
    return 1.0;
  };

  Launch(ctx, iter, predt, info, p_cache, delta, ti_plus, tj_minus, li, lj, out_gpair);
}

namespace {
struct ReduceOp {
  template <typename Tup>
  Tup XGBOOST_DEVICE operator()(Tup const& l, Tup const& r) {
    return thrust::make_tuple(thrust::get<0>(l) + thrust::get<0>(r),
                              thrust::get<1>(l) + thrust::get<1>(r));
  }
};
}  // namespace
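// The bias update below follows the ratio form of unbiased LambdaMART (cf. the
// eq.30/eq.31 references in GetGradOp): after reducing the per-position costs l+ / l-
// over all groups,
//
//   t+_i = (l+_i / l+_0) ^ regularizer
//   t-_i = (l-_i / l-_0) ^ regularizer
//
// so position 0 acts as the normalizer (t+_0 = t-_0 = 1) and the exponent comes from
// LambdaRankParam::Regularizer().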
void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
                                  linalg::VectorView<double const> lj_full,
                                  linalg::Vector<double>* p_ti_plus,
                                  linalg::Vector<double>* p_tj_minus,
                                  linalg::Vector<double>* p_li,  // loss
                                  linalg::Vector<double>* p_lj,
                                  std::shared_ptr<ltr::RankingCache> p_cache) {
  auto const d_group_ptr = p_cache->DataGroupPtr(ctx);
  auto n_groups = d_group_ptr.size() - 1;

  auto ti_plus = p_ti_plus->View(ctx->gpu_id);
  auto tj_minus = p_tj_minus->View(ctx->gpu_id);

  auto li = p_li->View(ctx->gpu_id);
  auto lj = p_lj->View(ctx->gpu_id);
  CHECK_EQ(li.Size(), ti_plus.Size());

  auto const& param = p_cache->Param();
  auto regularizer = param.Regularizer();
  std::size_t k = p_cache->MaxPositionSize();
  CHECK_EQ(li.Size(), k);
  CHECK_EQ(lj.Size(), k);

  // reduce li_full to li for each group.
  auto make_iter = [&](linalg::VectorView<double const> l_full) {
    auto l_it = [=] XGBOOST_DEVICE(std::size_t i) {
      // group index
      auto g = i % n_groups;
      // rank is the position within a group, also the segment index
      auto r = i / n_groups;

      auto begin = d_group_ptr[g];
      std::size_t group_size = d_group_ptr[g + 1] - begin;
      auto n = std::min(group_size, k);
      // r can be greater than n since we allocate threads based on the truncation
      // level instead of the actual group size.
      if (r >= n) {
        return 0.0;
      }
      return l_full(r + begin);
    };
    return l_it;
  };
  auto li_it =
      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), make_iter(li_full));
  auto lj_it =
      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), make_iter(lj_full));
  // k segments, each segment has size n_groups.
  auto key_it = dh::MakeTransformIterator<std::size_t>(
      thrust::make_counting_iterator(0ul),
      [=] XGBOOST_DEVICE(std::size_t i) { return i * n_groups; });
  auto val_it = thrust::make_zip_iterator(thrust::make_tuple(li_it, lj_it));
  auto out_it =
      thrust::make_zip_iterator(thrust::make_tuple(li.Values().data(), lj.Values().data()));

  auto init = thrust::make_tuple(0.0, 0.0);
  std::size_t bytes;
  cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, out_it, k, key_it, key_it + 1,
                                     ReduceOp{}, init, ctx->CUDACtx()->Stream());
  dh::TemporaryArray<char> temp(bytes);
  cub::DeviceSegmentedReduce::Reduce(temp.data().get(), bytes, val_it, out_it, k, key_it,
                                     key_it + 1, ReduceOp{}, init, ctx->CUDACtx()->Stream());

  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), li.Size(),
                     [=] XGBOOST_DEVICE(std::size_t i) mutable {
                       if (li(0) >= Eps64()) {
                         ti_plus(i) = std::pow(li(i) / li(0), regularizer);
                       }
                       if (lj(0) >= Eps64()) {
                         tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
                       }
                       assert(!isinf(ti_plus(i)));
                       assert(!isinf(tj_minus(i)));
                     });
}
}  // namespace cuda_impl
}  // namespace xgboost::obj