Merge branch 'master' into sync-condition-2023Apr11

2023-04-11 19:38:38 +02:00
parent 6825d986fd fe9dff339c
commit 08bc4b0c0f
56 changed files with 1912 additions and 983 deletions
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -123,7 +123,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {

  DMLC_DECLARE_PARAMETER(LambdaRankParam) {
    DMLC_DECLARE_FIELD(lambdarank_pair_method)
-        .set_default(PairMethod::kMean)
+        .set_default(PairMethod::kTopK)
        .add_enum("mean", PairMethod::kMean)
        .add_enum("topk", PairMethod::kTopK)
        .describe("Method for constructing pairs.");
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -112,7 +112,6 @@ class PerGroupWeightPolicy {
    return info.GetWeight(group_id);
  }
 };
-
 }  // anonymous namespace

 namespace xgboost::metric {
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
  size_t n_leaf = nidx.size();
  if (nptr.empty()) {
    std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
+    UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
    return;
  }

@@ -99,39 +99,46 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
  auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
                                        predt.Size() / info.num_row_);

-  // loop over each leaf
-  common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
-    auto nidx = h_node_idx[k];
-    CHECK(tree[nidx].IsLeaf());
-    CHECK_LT(k + 1, h_node_ptr.size());
-    size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
-    auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
+  if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
+    // loop over each leaf
+    common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
+      auto nidx = h_node_idx[k];
+      CHECK(tree[nidx].IsLeaf());
+      CHECK_LT(k + 1, h_node_ptr.size());
+      size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
+      auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);

-    auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
-    auto h_weights = linalg::MakeVec(&info.weights_);
+      auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
+      auto h_weights = linalg::MakeVec(&info.weights_);

-    auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
-      auto row_idx = h_row_set[i];
-      return h_labels(row_idx) - h_predt(row_idx, group_idx);
-    });
-    auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float {
-      auto row_idx = h_row_set[i];
-      return h_weights(row_idx);
+      auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
+        auto row_idx = h_row_set[i];
+        return h_labels(row_idx) - h_predt(row_idx, group_idx);
+      });
+      auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float {
+        auto row_idx = h_row_set[i];
+        return h_weights(row_idx);
+      });
+
+      float q{0};
+      if (info.weights_.Empty()) {
+        q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
+      } else {
+        q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
+      }
+      if (std::isnan(q)) {
+        CHECK(h_row_set.empty());
+      }
+      quantiles.at(k) = q;
    });
+  }

-    float q{0};
-    if (info.weights_.Empty()) {
-      q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
-    } else {
-      q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
-    }
-    if (std::isnan(q)) {
-      CHECK(h_row_set.empty());
-    }
-    quantiles.at(k) = q;
-  });
+  if (info.IsVerticalFederated()) {
+    collective::Broadcast(static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float),
+                          0);
+  }

-  UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
+  UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
 }

 #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -185,7 +185,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos

  if (nptr.Empty()) {
    std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), learning_rate, p_tree);
+    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
  }

  HostDeviceVector<float> quantiles;
@@ -220,7 +220,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                                      w_it + d_weights.size(), &quantiles);
  }

-  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), learning_rate, p_tree);
+  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
 }
 }  // namespace detail
 }  // namespace obj
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -36,13 +36,15 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
 }

 inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
-                             float learning_rate, RegTree* p_tree) {
+                             MetaInfo const& info, float learning_rate, RegTree* p_tree) {
  auto& tree = *p_tree;
  auto& quantiles = *p_quantiles;
  auto const& h_node_idx = nidx;

  size_t n_leaf{h_node_idx.size()};
-  collective::Allreduce<collective::Operation::kMax>(&n_leaf, 1);
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kMax>(&n_leaf, 1);
+  }
  CHECK(quantiles.empty() || quantiles.size() == n_leaf);
  if (quantiles.empty()) {
    quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -52,12 +54,16 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
  std::vector<int32_t> n_valids(quantiles.size());
  std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
                 [](float q) { return static_cast<int32_t>(!std::isnan(q)); });
-  collective::Allreduce<collective::Operation::kSum>(n_valids.data(), n_valids.size());
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kSum>(n_valids.data(), n_valids.size());
+  }
  // convert to 0 for all reduce
  std::replace_if(
      quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
  // use the mean value
-  collective::Allreduce<collective::Operation::kSum>(quantiles.data(), quantiles.size());
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kSum>(quantiles.data(), quantiles.size());
+  }
  for (size_t i = 0; i < n_leaf; ++i) {
    if (n_valids[i] > 0) {
      quantiles[i] /= static_cast<float>(n_valids[i]);
--- a/src/objective/init_estimation.cc
+++ b/src/objective/init_estimation.cc
@@ -14,8 +14,7 @@
 #include "xgboost/linalg.h"              // Tensor,Vector
 #include "xgboost/task.h"                // ObjInfo

-namespace xgboost {
-namespace obj {
+namespace xgboost::obj {
 void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const {
  if (this->Task().task == ObjInfo::kRegression) {
    CheckInitInputs(info);
@@ -31,14 +30,13 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
      ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
  new_obj->LoadConfig(config);
  new_obj->GetGradient(dummy_predt, info, 0, &gpair);
+
  bst_target_t n_targets = this->Targets(info);
  linalg::Vector<float> leaf_weight;
  tree::FitStump(this->ctx_, info, gpair, n_targets, &leaf_weight);
-
  // workaround, we don't support multi-target due to binary model serialization for
  // base margin.
  common::Mean(this->ctx_, leaf_weight, base_score);
  this->PredTransform(base_score->Data());
 }
-}  // namespace obj
-}  // namespace xgboost
+}  // namespace xgboost::obj
--- a/src/objective/init_estimation.h
+++ b/src/objective/init_estimation.h
@@ -7,8 +7,7 @@
 #include "xgboost/linalg.h"     // Tensor
 #include "xgboost/objective.h"  // ObjFunction

-namespace xgboost {
-namespace obj {
+namespace xgboost::obj {
 class FitIntercept : public ObjFunction {
  void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override;
 };
@@ -20,6 +19,5 @@ inline void CheckInitInputs(MetaInfo const& info) {
        << "Number of weights should be equal to number of data points.";
  }
 }
-}  // namespace obj
-}  // namespace xgboost
+}  // namespace xgboost::obj
 #endif  // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2015-2023 by XGBoost contributors
+ *
+ * \brief CUDA implementation of lambdarank.
+ */
+#include <thrust/fill.h>                        // for fill_n
+#include <thrust/for_each.h>                    // for for_each_n
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/iterator/zip_iterator.h>       // for make_zip_iterator
+#include <thrust/tuple.h>                       // for make_tuple, tuple, tie, get
+
+#include <algorithm>                            // for min
+#include <cassert>                              // for assert
+#include <cmath>                                // for abs, log2, isinf
+#include <cstddef>                              // for size_t
+#include <cstdint>                              // for int32_t
+#include <memory>                               // for shared_ptr
+#include <utility>
+
+#include "../common/algorithm.cuh"       // for SegmentedArgSort
+#include "../common/cuda_context.cuh"    // for CUDAContext
+#include "../common/deterministic.cuh"   // for CreateRoundingFactor, TruncateWithRounding
+#include "../common/device_helpers.cuh"  // for SegmentId, TemporaryArray, AtomicAddGpair
+#include "../common/optional_weight.h"   // for MakeOptionalWeights
+#include "../common/ranking_utils.h"     // for NDCGCache, LambdaRankParam, rel_degree_t
+#include "lambdarank_obj.cuh"
+#include "lambdarank_obj.h"
+#include "xgboost/base.h"                // for bst_group_t, XGBOOST_DEVICE, GradientPair
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/linalg.h"              // for VectorView, Range, Vector
+#include "xgboost/logging.h"
+#include "xgboost/span.h"                // for Span
+
+namespace xgboost::obj {
+DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu);
+
+namespace cuda_impl {
+common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
+                                      common::Span<std::size_t const> d_rank,
+                                      std::shared_ptr<ltr::RankingCache> p_cache) {
+  auto const d_group_ptr = p_cache->DataGroupPtr(ctx);
+  auto label = info.labels.View(ctx->gpu_id);
+  // The buffer for ranked y is necessary as cub segmented sort accepts only pointers.
+  auto d_y_ranked = p_cache->RankedY(ctx, info.num_row_);
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_y_ranked.size(),
+                     [=] XGBOOST_DEVICE(std::size_t i) {
+                       auto g = dh::SegmentId(d_group_ptr, i);
+                       auto g_label =
+                           label.Slice(linalg::Range(d_group_ptr[g], d_group_ptr[g + 1]), 0);
+                       auto g_rank_idx = d_rank.subspan(d_group_ptr[g], g_label.Size());
+                       i -= d_group_ptr[g];
+                       auto g_y_ranked = d_y_ranked.subspan(d_group_ptr[g], g_label.Size());
+                       g_y_ranked[i] = g_label(g_rank_idx[i]);
+                     });
+  auto d_y_sorted_idx = p_cache->SortedIdxY(ctx, info.num_row_);
+  common::SegmentedArgSort<false, true>(ctx, d_y_ranked, d_group_ptr, d_y_sorted_idx);
+  return d_y_sorted_idx;
+}
+}  // namespace cuda_impl
+}  // namespace xgboost::obj
--- a/src/objective/lambdarank_obj.cuh
+++ b/src/objective/lambdarank_obj.cuh
@@ -0,0 +1,172 @@
+/**
+ * Copyright 2023 XGBoost contributors
+ */
+#ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_
+#define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_
+
+#include <thrust/binary_search.h>                      // for lower_bound, upper_bound
+#include <thrust/functional.h>                         // for greater
+#include <thrust/iterator/counting_iterator.h>         // for make_counting_iterator
+#include <thrust/random/linear_congruential_engine.h>  // for minstd_rand
+#include <thrust/random/uniform_int_distribution.h>    // for uniform_int_distribution
+
+#include <cassert>                                     // for assert
+#include <cstddef>                                     // for size_t
+#include <cstdint>                                     // for int32_t
+#include <tuple>                                       // for make_tuple, tuple
+
+#include "../common/device_helpers.cuh"                // for MakeTransformIterator
+#include "../common/ranking_utils.cuh"                 // for PairsForGroup
+#include "../common/ranking_utils.h"                   // for RankingCache
+#include "../common/threading_utils.cuh"               // for UnravelTrapeziodIdx
+#include "xgboost/base.h"    // for bst_group_t, GradientPair, XGBOOST_DEVICE
+#include "xgboost/data.h"    // for MetaInfo
+#include "xgboost/linalg.h"  // for VectorView, Range, UnravelIndex
+#include "xgboost/span.h"    // for Span
+
+namespace xgboost::obj::cuda_impl {
+/**
+ * \brief Find the number of elements to the left of the label bucket.
+ */
+template <typename It, typename T = typename std::iterator_traits<It>::value_type>
+XGBOOST_DEVICE __forceinline__ std::size_t CountNumItemsToTheLeftOf(It items, std::size_t n, T v) {
+  return thrust::lower_bound(thrust::seq, items, items + n, v, thrust::greater<T>{}) - items;
+}
+/**
+ * \brief Find the number of elements to the right of the label bucket.
+ */
+template <typename It, typename T = typename std::iterator_traits<It>::value_type>
+XGBOOST_DEVICE __forceinline__ std::size_t CountNumItemsToTheRightOf(It items, std::size_t n, T v) {
+  return n - (thrust::upper_bound(thrust::seq, items, items + n, v, thrust::greater<T>{}) - items);
+}
+/**
+ * \brief Sort labels according to rank list for making pairs.
+ */
+common::Span<std::size_t const> SortY(Context const *ctx, MetaInfo const &info,
+                                      common::Span<std::size_t const> d_rank,
+                                      std::shared_ptr<ltr::RankingCache> p_cache);
+
+/**
+ * \brief Parameters needed for calculating gradient
+ */
+struct KernelInputs {
+  linalg::VectorView<double const> ti_plus;   // input bias ratio
+  linalg::VectorView<double const> tj_minus;  // input bias ratio
+  linalg::VectorView<double> li;
+  linalg::VectorView<double> lj;
+
+  common::Span<bst_group_t const> d_group_ptr;
+  common::Span<std::size_t const> d_threads_group_ptr;
+  common::Span<std::size_t const> d_sorted_idx;
+
+  linalg::MatrixView<float const> labels;
+  common::Span<float const> predts;
+  common::Span<GradientPair> gpairs;
+
+  linalg::VectorView<GradientPair const> d_roundings;
+  double const *d_cost_rounding;
+
+  common::Span<std::size_t const> d_y_sorted_idx;
+
+  std::int32_t iter;
+};
+/**
+ * \brief Functor for generating pairs
+ */
+template <bool has_truncation>
+struct MakePairsOp {
+  KernelInputs args;
+  /**
+   * \brief Make pair for the topk pair method.
+   */
+  XGBOOST_DEVICE std::tuple<std::size_t, std::size_t> WithTruncation(std::size_t idx,
+                                                                     bst_group_t g) const {
+    auto thread_group_begin = args.d_threads_group_ptr[g];
+    auto idx_in_thread_group = idx - thread_group_begin;
+
+    auto data_group_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
+    std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin;
+    // obtain group segment data.
+    auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0);
+    auto g_sorted_idx = args.d_sorted_idx.subspan(data_group_begin, n_data);
+
+    std::size_t i = 0, j = 0;
+    common::UnravelTrapeziodIdx(idx_in_thread_group, n_data, &i, &j);
+
+    std::size_t rank_high = i, rank_low = j;
+    return std::make_tuple(rank_high, rank_low);
+  }
+  /**
+   * \brief Make pair for the mean pair method
+   */
+  XGBOOST_DEVICE std::tuple<std::size_t, std::size_t> WithSampling(std::size_t idx,
+                                                                   bst_group_t g) const {
+    std::size_t n_samples = args.labels.Size();
+    assert(n_samples == args.predts.size());
+    // Constructed from ranking cache.
+    std::size_t n_pairs =
+        ltr::cuda_impl::PairsForGroup(args.d_threads_group_ptr[g + 1] - args.d_threads_group_ptr[g],
+                                      args.d_group_ptr[g + 1] - args.d_group_ptr[g]);
+
+    assert(n_pairs > 0);
+    auto [sample_idx, sample_pair_idx] = linalg::UnravelIndex(idx, {n_samples, n_pairs});
+
+    auto g_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
+    std::size_t n_data = args.d_group_ptr[g + 1] - g_begin;
+
+    auto g_label = args.labels.Slice(linalg::Range(g_begin, g_begin + n_data));
+    auto g_rank_idx = args.d_sorted_idx.subspan(args.d_group_ptr[g], n_data);
+    auto g_y_sorted_idx = args.d_y_sorted_idx.subspan(g_begin, n_data);
+
+    std::size_t const i = sample_idx - g_begin;
+    assert(sample_pair_idx < n_samples);
+    assert(i <= sample_idx);
+
+    auto g_sorted_label = dh::MakeTransformIterator<float>(
+        thrust::make_counting_iterator(0ul),
+        [&](std::size_t i) { return g_label(g_rank_idx[g_y_sorted_idx[i]]); });
+
+    // Are the labels diverse enough? If they are all the same, then there is nothing to pick
+    // from another group - bail sooner
+    if (g_label.Size() == 0 || g_sorted_label[0] == g_sorted_label[n_data - 1]) {
+      auto z = static_cast<std::size_t>(0ul);
+      return std::make_tuple(z, z);
+    }
+
+    std::size_t n_lefts = CountNumItemsToTheLeftOf(g_sorted_label, i + 1, g_sorted_label[i]);
+    std::size_t n_rights =
+        CountNumItemsToTheRightOf(g_sorted_label + i, n_data - i, g_sorted_label[i]);
+    // The index pointing to the first element of the next bucket
+    std::size_t right_bound = n_data - n_rights;
+
+    thrust::minstd_rand rng(args.iter);
+    auto pair_idx = i;
+    rng.discard(sample_pair_idx * n_data + g + pair_idx);  // fixme
+    thrust::uniform_int_distribution<std::size_t> dist(0, n_lefts + n_rights - 1);
+    auto ridx = dist(rng);
+    SPAN_CHECK(ridx < n_lefts + n_rights);
+    if (ridx >= n_lefts) {
+      ridx = ridx - n_lefts + right_bound;  // fixme
+    }
+
+    auto idx0 = g_y_sorted_idx[pair_idx];
+    auto idx1 = g_y_sorted_idx[ridx];
+
+    return std::make_tuple(idx0, idx1);
+  }
+  /**
+   * \brief Generate a single pair.
+   *
+   * \param idx Pair index (CUDA thread index).
+   * \param g   Query group index.
+   */
+  XGBOOST_DEVICE auto operator()(std::size_t idx, bst_group_t g) const {
+    if (has_truncation) {
+      return this->WithTruncation(idx, g);
+    } else {
+      return this->WithSampling(idx, g);
+    }
+  }
+};
+}  // namespace xgboost::obj::cuda_impl
+#endif  // XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_
--- a/src/objective/lambdarank_obj.h
+++ b/src/objective/lambdarank_obj.h
@@ -0,0 +1,260 @@
+/**
+ * Copyright 2023 XGBoost contributors
+ */
+#ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
+#define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
+#include <algorithm>                       // for min, max
+#include <cassert>                         // for assert
+#include <cmath>                           // for log, abs
+#include <cstddef>                         // for size_t
+#include <functional>                      // for greater
+#include <memory>                          // for shared_ptr
+#include <random>                          // for minstd_rand, uniform_int_distribution
+#include <vector>                          // for vector
+
+#include "../common/algorithm.h"           // for ArgSort
+#include "../common/math.h"                // for Sigmoid
+#include "../common/ranking_utils.h"       // for CalcDCGGain
+#include "../common/transform_iterator.h"  // for MakeIndexTransformIter
+#include "xgboost/base.h"                  // for GradientPair, XGBOOST_DEVICE, kRtEps
+#include "xgboost/context.h"               // for Context
+#include "xgboost/data.h"                  // for MetaInfo
+#include "xgboost/host_device_vector.h"    // for HostDeviceVector
+#include "xgboost/linalg.h"                // for VectorView, Vector
+#include "xgboost/logging.h"               // for CHECK_EQ
+#include "xgboost/span.h"                  // for Span
+
+namespace xgboost::obj {
+template <bool exp>
+XGBOOST_DEVICE double DeltaNDCG(float y_high, float y_low, std::size_t r_high, std::size_t r_low,
+                                double inv_IDCG, common::Span<double const> discount) {
+  double gain_high = exp ? ltr::CalcDCGGain(y_high) : y_high;
+  double discount_high = discount[r_high];
+
+  double gain_low = exp ? ltr::CalcDCGGain(y_low) : y_low;
+  double discount_low = discount[r_low];
+
+  double original = gain_high * discount_high + gain_low * discount_low;
+  double changed = gain_low * discount_high + gain_high * discount_low;
+
+  double delta_NDCG = (original - changed) * inv_IDCG;
+  assert(delta_NDCG >= -1.0);
+  assert(delta_NDCG <= 1.0);
+  return delta_NDCG;
+}
+
+XGBOOST_DEVICE inline double DeltaMAP(float y_high, float y_low, std::size_t rank_high,
+                                      std::size_t rank_low, common::Span<double const> n_rel,
+                                      common::Span<double const> acc) {
+  double r_h = static_cast<double>(rank_high) + 1.0;
+  double r_l = static_cast<double>(rank_low) + 1.0;
+  double delta{0.0};
+  double n_total_relevances = n_rel.back();
+  assert(n_total_relevances > 0.0);
+  auto m = n_rel[rank_low];
+  double n = n_rel[rank_high];
+
+  if (y_high < y_low) {
+    auto a = m / r_l - (n + 1.0) / r_h;
+    auto b = acc[rank_low - 1] - acc[rank_high];
+    delta = (a - b) / n_total_relevances;
+  } else {
+    auto a = n / r_h - m / r_l;
+    auto b = acc[rank_low - 1] - acc[rank_high];
+    delta = (a + b) / n_total_relevances;
+  }
+  return delta;
+}
+
+template <bool unbiased, typename Delta>
+XGBOOST_DEVICE GradientPair
+LambdaGrad(linalg::VectorView<float const> labels, common::Span<float const> predts,
+           common::Span<size_t const> sorted_idx,
+           std::size_t rank_high,                     // coordinate
+           std::size_t rank_low,                      // coordinate
+           Delta delta,                               // delta score
+           linalg::VectorView<double const> t_plus,   // input bias ratio
+           linalg::VectorView<double const> t_minus,  // input bias ratio
+           double* p_cost) {
+  assert(sorted_idx.size() > 0 && "Empty sorted idx for a group.");
+  std::size_t idx_high = sorted_idx[rank_high];
+  std::size_t idx_low = sorted_idx[rank_low];
+
+  if (labels(idx_high) == labels(idx_low)) {
+    *p_cost = 0;
+    return {0.0f, 0.0f};
+  }
+
+  auto best_score = predts[sorted_idx.front()];
+  auto worst_score = predts[sorted_idx.back()];
+
+  auto y_high = labels(idx_high);
+  float s_high = predts[idx_high];
+  auto y_low = labels(idx_low);
+  float s_low = predts[idx_low];
+
+  // Use double whenever possible as we are working on the exp space.
+  double delta_score = std::abs(s_high - s_low);
+  double sigmoid = common::Sigmoid(s_high - s_low);
+  // Change in metric score like \delta NDCG or \delta MAP
+  double delta_metric = std::abs(delta(y_high, y_low, rank_high, rank_low));
+
+  if (best_score != worst_score) {
+    delta_metric /= (delta_score + kRtEps);
+  }
+
+  if (unbiased) {
+    *p_cost = std::log(1.0 / (1.0 - sigmoid)) * delta_metric;
+  }
+
+  constexpr double kEps = 1e-16;
+  auto lambda_ij = (sigmoid - 1.0) * delta_metric;
+  auto hessian_ij = std::max(sigmoid * (1.0 - sigmoid), kEps) * delta_metric * 2.0;
+
+  auto k = t_plus.Size();
+  assert(t_minus.Size() == k && "Invalid size of position bias");
+
+  if (unbiased && idx_high < k && idx_low < k) {
+    lambda_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
+    hessian_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
+  }
+
+  auto pg = GradientPair{static_cast<float>(lambda_ij), static_cast<float>(hessian_ij)};
+  return pg;
+}
+
+XGBOOST_DEVICE inline GradientPair Repulse(GradientPair pg) {
+  auto ng = GradientPair{-pg.GetGrad(), pg.GetHess()};
+  return ng;
+}
+
+namespace cuda_impl {
+void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
+                               HostDeviceVector<float> const& preds, MetaInfo const& info,
+                               std::shared_ptr<ltr::NDCGCache> p_cache,
+                               linalg::VectorView<double const> t_plus,   // input bias ratio
+                               linalg::VectorView<double const> t_minus,  // input bias ratio
+                               linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                               HostDeviceVector<GradientPair>* out_gpair);
+
+/**
+ * \brief Generate statistics for MAP, used for calculating \Delta Z in LambdaMART.
+ */
+void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
+             std::shared_ptr<ltr::MAPCache> p_cache);
+
+void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
+                              HostDeviceVector<float> const& predt, MetaInfo const& info,
+                              std::shared_ptr<ltr::MAPCache> p_cache,
+                              linalg::VectorView<double const> t_plus,   // input bias ratio
+                              linalg::VectorView<double const> t_minus,  // input bias ratio
+                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                              HostDeviceVector<GradientPair>* out_gpair);
+
+void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
+                                   HostDeviceVector<float> const& predt, const MetaInfo& info,
+                                   std::shared_ptr<ltr::RankingCache> p_cache,
+                                   linalg::VectorView<double const> ti_plus,   // input bias ratio
+                                   linalg::VectorView<double const> tj_minus,  // input bias ratio
+                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                                   HostDeviceVector<GradientPair>* out_gpair);
+
+void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
+                                  linalg::VectorView<double const> lj_full,
+                                  linalg::Vector<double>* p_ti_plus,
+                                  linalg::Vector<double>* p_tj_minus, linalg::Vector<double>* p_li,
+                                  linalg::Vector<double>* p_lj,
+                                  std::shared_ptr<ltr::RankingCache> p_cache);
+}  // namespace cuda_impl
+
+namespace cpu_impl {
+/**
+ * \brief Generate statistics for MAP, used for calculating \Delta Z in LambdaMART.
+ *
+ * \param label    Ground truth relevance label.
+ * \param rank_idx Sorted index of prediction.
+ * \param p_cache  An initialized MAPCache.
+ */
+void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
+             common::Span<std::size_t const> rank_idx, std::shared_ptr<ltr::MAPCache> p_cache);
+}  // namespace cpu_impl
+
+/**
+ * \brief Construct pairs on CPU.
+ *
+ * \tparam Op Functor for updating a pair of gradients.
+ *
+ * \param ctx     The global context.
+ * \param iter    The boosting iteration.
+ * \param cache   ltr cache.
+ * \param g       The current query group.
+ * \param g_label The labels for the current query group.
+ * \param g_rank  Sorted index of model scores for the current query group.
+ * \param op      A callable that accepts two indices for a pair of documents. The indices are
+ *                for the ranked list (labels sorted according to model scores).
+ */
+template <typename Op>
+void MakePairs(Context const* ctx, std::int32_t iter,
+               std::shared_ptr<ltr::RankingCache> const cache, bst_group_t g,
+               linalg::VectorView<float const> g_label, common::Span<std::size_t const> g_rank,
+               Op op) {
+  auto group_ptr = cache->DataGroupPtr(ctx);
+  ltr::position_t cnt = group_ptr[g + 1] - group_ptr[g];
+
+  if (cache->Param().HasTruncation()) {
+    for (std::size_t i = 0; i < std::min(cnt, cache->Param().NumPair()); ++i) {
+      for (std::size_t j = i + 1; j < cnt; ++j) {
+        op(i, j);
+      }
+    }
+  } else {
+    CHECK_EQ(g_rank.size(), g_label.Size());
+    std::minstd_rand rnd(iter);
+    rnd.discard(g);  // fixme(jiamingy): honor the global seed
+    // sort label according to the rank list
+    auto it = common::MakeIndexTransformIter(
+        [&g_rank, &g_label](std::size_t idx) { return g_label(g_rank[idx]); });
+    std::vector<std::size_t> y_sorted_idx =
+        common::ArgSort<std::size_t>(ctx, it, it + cnt, std::greater<>{});
+    // permutation iterator to get the original label
+    auto rev_it = common::MakeIndexTransformIter(
+        [&](std::size_t idx) { return g_label(g_rank[y_sorted_idx[idx]]); });
+
+    for (std::size_t i = 0; i < cnt;) {
+      std::size_t j = i + 1;
+      // find the bucket boundary
+      while (j < cnt && rev_it[i] == rev_it[j]) {
+        ++j;
+      }
+      // Bucket [i,j), construct n_samples pairs for each sample inside the bucket with
+      // another sample outside the bucket.
+      //
+      // n_lefts elements to the left of the bucket, and n_rights elements to the right of it
+      std::size_t n_lefts = i, n_rights = static_cast<std::size_t>(cnt - j);
+      if (n_lefts + n_rights == 0) {
+        i = j;
+        continue;
+      }
+
+      auto n_samples = cache->Param().NumPair();
+      // for each pair specified by the user
+      while (n_samples--) {
+        // for each sample in the bucket
+        for (std::size_t pair_idx = i; pair_idx < j; ++pair_idx) {
+          std::size_t ridx = std::uniform_int_distribution<std::size_t>(
+              static_cast<std::size_t>(0), n_lefts + n_rights - 1)(rnd);
+          if (ridx >= n_lefts) {
+            ridx = ridx - i + j;  // shift to the right of the bucket
+          }
+          // index that points to the rank list.
+          auto idx0 = y_sorted_idx[pair_idx];
+          auto idx1 = y_sorted_idx[ridx];
+          op(idx0, idx1);
+        }
+      }
+      i = j;
+    }
+  }
+}
+}  // namespace xgboost::obj
+#endif  // XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -35,7 +35,10 @@ class QuantileRegression : public ObjFunction {
  bst_target_t Targets(MetaInfo const& info) const override {
    auto const& alpha = param_.quantile_alpha.Get();
    CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
-    CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target is not yet supported by the quantile loss.";
+    if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
+      CHECK_EQ(info.labels.Shape(1), 1)
+          << "Multi-target is not yet supported by the quantile loss.";
+    }
    CHECK(!alpha.empty());
    // We have some placeholders for multi-target in the quantile loss. But it's not
    // supported as the gbtree doesn't know how to slice the gradient and there's no 3-dim
@@ -167,8 +170,10 @@ class QuantileRegression : public ObjFunction {
    common::Mean(ctx_, *base_score, &temp);
    double meanq = temp(0) * sw;

-    collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
-    collective::Allreduce<collective::Operation::kSum>(&sw, 1);
+    if (info.IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
+      collective::Allreduce<collective::Operation::kSum>(&sw, 1);
+    }
    meanq /= (sw + kRtEps);
    base_score->Reshape(1);
    base_score->Data()->Fill(meanq);
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -728,8 +728,10 @@ class MeanAbsoluteError : public ObjFunction {
    std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
                   [w](float v) { return v * w; });

-    collective::Allreduce<collective::Operation::kSum>(out.Values().data(), out.Values().size());
-    collective::Allreduce<collective::Operation::kSum>(&w, 1);
+    if (info.IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(out.Values().data(), out.Values().size());
+      collective::Allreduce<collective::Operation::kSum>(&w, 1);
+    }

    if (common::CloseTo(w, 0.0)) {
      // Mostly for handling empty dataset test.