initial merge

2023-03-25 04:31:55 +01:00
parent d97be6f396 cff50fe3ef
commit 7fbc561e17
146 changed files with 6730 additions and 4082 deletions
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@@ -14,7 +14,7 @@

 // clang with libstdc++ works as well
 #if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
-    !defined(__APPLE__) && __has_include(<omp.h>)
+    !defined(__APPLE__) && __has_include(<omp.h>) && __has_include(<parallel/algorithm>)
 #define GCC_HAS_PARALLEL 1
 #endif  // GLIC_VERSION

--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -121,17 +121,20 @@ namespace dh {
 #ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)

-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-                                     int line) {
+inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
  if (code != ncclSuccess) {
    std::stringstream ss;
-    ss << "NCCL failure :" << ncclGetErrorString(code);
+    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
+    ss << " " << file << "(" << line << ")\n";
    if (code == ncclUnhandledCudaError) {
      // nccl usually preserves the last error so we can get more details.
      auto err = cudaPeekAtLastError();
-      ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
+      ss << "  CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+    } else if (code == ncclSystemError) {
+      ss << "  This might be caused by a network configuration issue. Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
    }
-    ss << " " << file << "(" << line << ")";
    LOG(FATAL) << ss.str();
  }

--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -2,6 +2,9 @@
 * Copyright 2017-2023 XGBoost contributors
 */
 #pragma once
+
+#if defined(XGBOOST_USE_CUDA)
+
 #include <thrust/binary_search.h>  // thrust::upper_bound
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_ptr.h>
@@ -95,20 +98,23 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) {  // NOLINT
 }
 namespace dh {

-#ifdef XGBOOST_USE_NCCL
+#ifdef XGBOOST_USE_RCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)

-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-                                     int line) {
+inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
  if (code != ncclSuccess) {
    std::stringstream ss;
-    ss << "NCCL failure :" << ncclGetErrorString(code);
+    ss << "RCCL failure: " << ncclGetErrorString(code) << ".";
+    ss << " " << file << "(" << line << ")\n";
    if (code == ncclUnhandledCudaError) {
      // nccl usually preserves the last error so we can get more details.
      auto err = hipPeekAtLastError();
-      ss << " " << thrust::system_error(err, thrust::hip_category()).what();
+      ss << "  HIP error: " << thrust::system_error(err, thrust::hip_category()).what() << "\n";
+    } else if (code == ncclSystemError) {
+      ss << "  This might be caused by a network configuration issue. Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
    }
-    ss << " " << file << "(" << line << ")";
    LOG(FATAL) << ss.str();
  }

--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
 constexpr StringView LabelScoreSize() {
  return "The size of label doesn't match the size of prediction.";
 }
+
+constexpr StringView InfInData() {  // message for inf/too-large input while `missing` is not inf
+  return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -7,23 +7,22 @@
 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
 #define XGBOOST_COMMON_HIST_UTIL_H_

-#include <xgboost/data.h>
-
 #include <algorithm>
+#include <cstdint>  // for uint32_t
 #include <limits>
 #include <map>
 #include <memory>
 #include <utility>
 #include <vector>

-#include "algorithm.h"  // SegmentId
 #include "categorical.h"
 #include "common.h"
 #include "quantile.h"
 #include "row_set.h"
 #include "threading_utils.h"
 #include "timer.h"
-#include "xgboost/base.h"  // bst_feature_t, bst_bin_t
+#include "xgboost/base.h"  // for bst_feature_t, bst_bin_t
+#include "xgboost/data.h"

 namespace xgboost {
 class GHistIndexMatrix;
@@ -392,15 +391,18 @@ class HistCollection {
  }

  // have we computed a histogram for i-th node?
-  bool RowExists(bst_uint nid) const {
+  [[nodiscard]] bool RowExists(bst_uint nid) const {  // row_ptr_[nid] == uint32 max => no row
    const uint32_t k_max = std::numeric_limits<uint32_t>::max();
    return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
  }
-
-  // initialize histogram collection
-  void Init(uint32_t nbins) {
-    if (nbins_ != nbins) {
-      nbins_ = nbins;
+  /**
+   * \brief Initialize histogram collection.
+   *
+   * \param n_total_bins Number of bins across all features.
+   */
+  void Init(std::uint32_t n_total_bins) {
+    if (nbins_ != n_total_bins) {
+      nbins_ = n_total_bins;
      // quite expensive operation, so let's do this only once
      data_.clear();
    }
--- a/src/common/json.cc
+++ b/src/common/json.cc
@@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength;
 Json JsonReader::Parse() {
  while (true) {
    SkipSpaces();
-    char c = PeekNextChar();
+    auto c = PeekNextChar();
    if (c == -1) { break; }

    if (c == '{') {
@@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const {
 }

 namespace {
-bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
+bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
 }  // anonymous namespace

 // Json class
 void JsonReader::SkipSpaces() {
  while (cursor_.Pos() < raw_str_.size()) {
-    char c = raw_str_[cursor_.Pos()];
+    Char c = raw_str_[cursor_.Pos()];
    if (IsSpace(c)) {
      cursor_.Forward();
    } else {
@@ -436,12 +436,12 @@ void ParseStr(std::string const& str) {
 }

 Json JsonReader::ParseString() {
-  char ch { GetConsecutiveChar('\"') };  // NOLINT
+  Char ch { GetConsecutiveChar('\"') };  // NOLINT
  std::string str;
  while (true) {
    ch = GetNextChar();
    if (ch == '\\') {
-      char next = static_cast<char>(GetNextChar());
+      Char next{GetNextChar()};
      switch (next) {
        case 'r':  str += u8"\r"; break;
        case 'n':  str += u8"\n"; break;
@@ -466,8 +466,8 @@ Json JsonReader::ParseString() {
 }

 Json JsonReader::ParseNull() {
-  char ch = GetNextNonSpaceChar();
-  std::string buffer{ch};
+  Char ch = GetNextNonSpaceChar();
+  std::string buffer{static_cast<char>(ch)};
  for (size_t i = 0; i < 3; ++i) {
    buffer.push_back(GetNextChar());
  }
@@ -480,7 +480,7 @@ Json JsonReader::ParseNull() {
 Json JsonReader::ParseArray() {
  std::vector<Json> data;

-  char ch { GetConsecutiveChar('[') };  // NOLINT
+  Char ch { GetConsecutiveChar('[') };  // NOLINT
  while (true) {
    if (PeekNextChar() == ']') {
      GetConsecutiveChar(']');
@@ -503,7 +503,7 @@ Json JsonReader::ParseObject() {

  Object::Map data;
  SkipSpaces();
-  char ch = PeekNextChar();
+  auto ch = PeekNextChar();

  if (ch == '}') {
    GetConsecutiveChar('}');
@@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() {

 Json JsonReader::ParseBoolean() {
  bool result = false;
-  char ch = GetNextNonSpaceChar();
+  Char ch = GetNextNonSpaceChar();
  std::string const t_value = u8"true";
  std::string const f_value = u8"false";

@@ -737,7 +737,8 @@ Json UBJReader::ParseArray() {
      case 'L':
        return ParseTypedArray<I64Array>(n);
      default:
-        LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array.";  // NOLINT
+        LOG(FATAL) << "`" + std::string{static_cast<char>(type)} +  // NOLINT
+                          "` is not supported for typed array.";
    }
  }
  std::vector<Json> results;
@@ -794,7 +795,7 @@ Json UBJReader::Load() {

 Json UBJReader::Parse() {
  while (true) {
-    char c = PeekNextChar();
+    auto c = PeekNextChar();
    if (c == -1) {
      break;
    }
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@@ -1,13 +1,15 @@
-/*!
- * Copyright 2022, XGBoost contributors.
+/**
+ * Copyright 2022-2023 by XGBoost contributors.
 */
 #ifndef XGBOOST_COMMON_NUMERIC_H_
 #define XGBOOST_COMMON_NUMERIC_H_

 #include <dmlc/common.h>  // OMPException

-#include <algorithm>  // std::max
-#include <iterator>   // std::iterator_traits
+#include <algorithm>  // for std::max
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t
+#include <iterator>   // for iterator_traits
 #include <vector>

 #include "common.h"                      // AssertGPUSupport
@@ -15,8 +17,7 @@
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector

-namespace xgboost {
-namespace common {
+namespace xgboost::common {

 /**
 * \brief Run length encode on CPU, input must be sorted.
@@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
 namespace cpu_impl {
 template <typename It, typename V = typename It::value_type>
 V Reduce(Context const* ctx, It first, It second, V const& init) {
-  size_t n = std::distance(first, second);
-  common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
-  common::ParallelFor(n, ctx->Threads(),
-                      [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
-  auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
+  std::size_t n = std::distance(first, second);  // may be 0; `init` is added once, at the end
+  auto n_threads = std::max<std::size_t>(1, std::min<std::size_t>(n, ctx->Threads()));  // never 0
+  common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, V{});  // zeroed
+  common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
+  auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
  return result;
 }
 }  // namespace cpu_impl
@@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last,
    });
  }
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common

 #endif  // XGBOOST_COMMON_NUMERIC_H_
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -1,391 +1,386 @@
-/*!
- * Copyright 2021-2022 by Contributors
- * \file row_set.h
- * \brief Quick Utility to compute subset of rows
- * \author Philip Cho, Tianqi Chen
- */
-#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
-#define XGBOOST_COMMON_PARTITION_BUILDER_H_
-
-#include <xgboost/data.h>
-
-#include <algorithm>
-#include <limits>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "../tree/hist/expand_entry.h"
-#include "categorical.h"
-#include "column_matrix.h"
-#include "xgboost/context.h"
-#include "xgboost/tree_model.h"
-
-namespace xgboost {
-namespace common {
-
-// The builder is required for samples partition to left and rights children for set of nodes
-// Responsible for:
-// 1) Effective memory allocation for intermediate results for multi-thread work
-// 2) Merging partial results produced by threads into original row set (row_set_collection_)
-// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
-template<size_t BlockSize>
-class PartitionBuilder {
-  using BitVector = RBitField8;
-
- public:
-  template<typename Func>
-  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
-    left_right_nodes_sizes_.resize(n_nodes);
-    blocks_offsets_.resize(n_nodes+1);
-
-    blocks_offsets_[0] = 0;
-    for (size_t i = 1; i < n_nodes+1; ++i) {
-      blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
-    }
-
-    if (n_tasks > max_n_tasks_) {
-      mem_blocks_.resize(n_tasks);
-      max_n_tasks_ = n_tasks;
-    }
-  }
-
-  // split row indexes (rid_span) to 2 parts (left_part, right_part) depending
-  // on comparison of indexes values (idx_span) and split point (split_cond)
-  // Handle dense columns
-  // Analog of std::stable_partition, but in no-inplace manner
-  template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
-  inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
-                                                   common::Span<const size_t> row_indices,
-                                                   common::Span<size_t> left_part,
-                                                   common::Span<size_t> right_part,
-                                                   size_t base_rowid, Predicate&& pred) {
-    auto& column = *p_column;
-    size_t* p_left_part = left_part.data();
-    size_t* p_right_part = right_part.data();
-    size_t nleft_elems = 0;
-    size_t nright_elems = 0;
-
-    auto p_row_indices = row_indices.data();
-    auto n_samples = row_indices.size();
-
-    for (size_t i = 0; i < n_samples; ++i) {
-      auto rid = p_row_indices[i];
-      const int32_t bin_id = column[rid - base_rowid];
-      if (any_missing && bin_id == ColumnType::kMissingId) {
-        if (default_left) {
-          p_left_part[nleft_elems++] = rid;
-        } else {
-          p_right_part[nright_elems++] = rid;
-        }
-      } else {
-        if (pred(rid, bin_id)) {
-          p_left_part[nleft_elems++] = rid;
-        } else {
-          p_right_part[nright_elems++] = rid;
-        }
-      }
-    }
-
-    return {nleft_elems, nright_elems};
-  }
-
-  template <typename Pred>
-  inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
-                                                        common::Span<size_t> left_part,
-                                                        common::Span<size_t> right_part,
-                                                        Pred pred) {
-    size_t* p_left_part = left_part.data();
-    size_t* p_right_part = right_part.data();
-    size_t nleft_elems = 0;
-    size_t nright_elems = 0;
-    for (auto row_id : ridx) {
-      if (pred(row_id)) {
-        p_left_part[nleft_elems++] = row_id;
-      } else {
-        p_right_part[nright_elems++] = row_id;
-      }
-    }
-    return {nleft_elems, nright_elems};
-  }
-
-  template <typename BinIdxType, bool any_missing, bool any_cat>
-  void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
-                 const common::Range1d range,
-                 const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
-                 const common::ColumnMatrix& column_matrix,
-                 const RegTree& tree, const size_t* rid) {
-    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
-    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
-    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
-    std::size_t nid = nodes[node_in_set].nid;
-    bst_feature_t fid = tree[nid].SplitIndex();
-    bool default_left = tree[nid].DefaultLeft();
-    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
-    auto node_cats = tree.NodeCats(nid);
-    auto const& cut_values = gmat.cut.Values();
-
-    auto pred_hist = [&](auto ridx, auto bin_id) {
-      if (any_cat && is_cat) {
-        auto gidx = gmat.GetGindex(ridx, fid);
-        bool go_left = default_left;
-        if (gidx > -1) {
-          go_left = Decision(node_cats, cut_values[gidx]);
-        }
-        return go_left;
-      } else {
-        return bin_id <= split_cond;
-      }
-    };
-
-    auto pred_approx = [&](auto ridx) {
-      auto gidx = gmat.GetGindex(ridx, fid);
-      bool go_left = default_left;
-      if (gidx > -1) {
-        if (is_cat) {
-          go_left = Decision(node_cats, cut_values[gidx]);
-        } else {
-          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
-        }
-      }
-      return go_left;
-    };
-
-    std::pair<size_t, size_t> child_nodes_sizes;
-    if (!column_matrix.IsInitialized()) {
-      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
-    } else {
-      if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
-        auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
-        if (default_left) {
-          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
-                                                                 gmat.base_rowid, pred_hist);
-        } else {
-          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
-                                                                  gmat.base_rowid, pred_hist);
-        }
-      } else {
-        CHECK_EQ(any_missing, true);
-        auto column =
-            column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
-        if (default_left) {
-          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
-                                                                 gmat.base_rowid, pred_hist);
-        } else {
-          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
-                                                                  gmat.base_rowid, pred_hist);
-        }
-      }
-    }
-
-    const size_t n_left  = child_nodes_sizes.first;
-    const size_t n_right = child_nodes_sizes.second;
-
-    SetNLeftElems(node_in_set, range.begin(), n_left);
-    SetNRightElems(node_in_set, range.begin(), n_right);
-  }
-
-  /**
-   * @brief When data is split by column, we don't have all the features locally on the current
-   * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
-   * to go right, or if the feature value used for the split is missing.
-   */
-  void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
-                const common::Range1d range, GHistIndexMatrix const& gmat,
-                const common::ColumnMatrix& column_matrix,
-                const RegTree& tree, const size_t* rid,
-                BitVector* decision_bits, BitVector* missing_bits) {
-    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
-    std::size_t nid = nodes[node_in_set].nid;
-    bst_feature_t fid = tree[nid].SplitIndex();
-    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
-    auto node_cats = tree.NodeCats(nid);
-    auto const& cut_values = gmat.cut.Values();
-
-    if (!column_matrix.IsInitialized()) {
-      for (auto row_id : rid_span) {
-        auto gidx = gmat.GetGindex(row_id, fid);
-        if (gidx > -1) {
-          bool go_left = false;
-          if (is_cat) {
-            go_left = Decision(node_cats, cut_values[gidx]);
-          } else {
-            go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
-          }
-          if (go_left) {
-            decision_bits->Set(row_id - gmat.base_rowid);
-          }
-        } else {
-          missing_bits->Set(row_id - gmat.base_rowid);
-        }
-      }
-    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
-    }
-  }
-
-  /**
-   * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
-   * use them to partition the rows accordingly.
-   */
-  void PartitionByMask(const size_t node_in_set,
-                       std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
-                       const common::Range1d range, GHistIndexMatrix const& gmat,
-                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
-                       const size_t* rid, BitVector const& decision_bits,
-                       BitVector const& missing_bits) {
-    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
-    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
-    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
-    std::size_t nid = nodes[node_in_set].nid;
-    bool default_left = tree[nid].DefaultLeft();
-
-    auto pred_approx = [&](auto ridx) {
-      bool go_left = default_left;
-      bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
-      if (!is_missing) {
-        go_left = decision_bits.Check(ridx - gmat.base_rowid);
-      }
-      return go_left;
-    };
-
-    std::pair<size_t, size_t> child_nodes_sizes;
-    if (!column_matrix.IsInitialized()) {
-      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
-    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
-    }
-
-    const size_t n_left  = child_nodes_sizes.first;
-    const size_t n_right = child_nodes_sizes.second;
-
-    SetNLeftElems(node_in_set, range.begin(), n_left);
-    SetNRightElems(node_in_set, range.begin(), n_right);
-  }
-
-  // allocate thread local memory, should be called for each specific task
-  void AllocateForTask(size_t id) {
-    if (mem_blocks_[id].get() == nullptr) {
-      BlockInfo* local_block_ptr = new BlockInfo;
-      CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
-      mem_blocks_[id].reset(local_block_ptr);
-    }
-  }
-
-  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
-    const size_t task_idx = GetTaskIdx(nid, begin);
-    return { mem_blocks_.at(task_idx)->Left(), end - begin };
-  }
-
-  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
-    const size_t task_idx = GetTaskIdx(nid, begin);
-    return { mem_blocks_.at(task_idx)->Right(), end - begin };
-  }
-
-  void SetNLeftElems(int nid, size_t begin, size_t n_left) {
-    size_t task_idx = GetTaskIdx(nid, begin);
-    mem_blocks_.at(task_idx)->n_left = n_left;
-  }
-
-  void SetNRightElems(int nid, size_t begin, size_t n_right) {
-    size_t task_idx = GetTaskIdx(nid, begin);
-    mem_blocks_.at(task_idx)->n_right = n_right;
-  }
-
-
-  size_t GetNLeftElems(int nid) const {
-    return left_right_nodes_sizes_[nid].first;
-  }
-
-  size_t GetNRightElems(int nid) const {
-    return left_right_nodes_sizes_[nid].second;
-  }
-
-  // Each thread has partial results for some set of tree-nodes
-  // The function decides order of merging partial results into final row set
-  void CalculateRowOffsets() {
-    for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
-      size_t n_left = 0;
-      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
-        mem_blocks_[j]->n_offset_left = n_left;
-        n_left += mem_blocks_[j]->n_left;
-      }
-      size_t n_right = 0;
-      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
-        mem_blocks_[j]->n_offset_right = n_left + n_right;
-        n_right += mem_blocks_[j]->n_right;
-      }
-      left_right_nodes_sizes_[i] = {n_left, n_right};
-    }
-  }
-
-  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
-    size_t task_idx = GetTaskIdx(nid, begin);
-
-    size_t* left_result  = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
-    size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
-
-    const size_t* left = mem_blocks_[task_idx]->Left();
-    const size_t* right = mem_blocks_[task_idx]->Right();
-
-    std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
-    std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
-  }
-
-  size_t GetTaskIdx(int nid, size_t begin) {
-    return blocks_offsets_[nid] + begin / BlockSize;
-  }
-
-  // Copy row partitions into global cache for reuse in objective
-  template <typename Sampledp>
-  void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
-                     std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
-    auto& h_pos = *p_position;
-    h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
-
-    auto p_begin = row_set.Data()->data();
-    ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
-      auto const& node = row_set[i];
-      if (node.node_id < 0) {
-        return;
-      }
-      CHECK(tree[node.node_id].IsLeaf());
-      if (node.begin) {  // guard for empty node.
-        size_t ptr_offset = node.end - p_begin;
-        CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
-        for (auto idx = node.begin; idx != node.end; ++idx) {
-          h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
-        }
-      }
-    });
-  }
-
- protected:
-  struct BlockInfo{
-    size_t n_left;
-    size_t n_right;
-
-    size_t n_offset_left;
-    size_t n_offset_right;
-
-    size_t* Left() {
-      return &left_data_[0];
-    }
-
-    size_t* Right() {
-      return &right_data_[0];
-    }
-   private:
-    size_t left_data_[BlockSize];
-    size_t right_data_[BlockSize];
-  };
-  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
-  std::vector<size_t> blocks_offsets_;
-  std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
-  size_t max_n_tasks_ = 0;
-};
-
-}  // namespace common
-}  // namespace xgboost
-
-#endif  // XGBOOST_COMMON_PARTITION_BUILDER_H_
+/**
+ * Copyright 2021-2023 by Contributors
+ * \file partition_builder.h
+ * \brief Utility to partition the rows of tree nodes into their left and right children
+ * \author Philip Cho, Tianqi Chen
+ */
+#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
+#define XGBOOST_COMMON_PARTITION_BUILDER_H_
+
+#include <xgboost/data.h>
+
+#include <algorithm>
+#include <cstddef>  // for size_t
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "../tree/hist/expand_entry.h"
+#include "categorical.h"
+#include "column_matrix.h"
+#include "xgboost/context.h"
+#include "xgboost/tree_model.h"
+
+namespace xgboost::common {
+// The builder partitions samples into the left and right children of a set of nodes.
+// Responsible for:
+// 1) Efficient memory allocation for intermediate results for multi-thread work
+// 2) Merging partial results produced by threads into original row set (row_set_collection_)
+// BlockSize is a template parameter so that memory alignment is easy with C++11 'alignas()'
+template<size_t BlockSize>
+class PartitionBuilder {
+  using BitVector = RBitField8;
+
+ public:
+  // Sizes the per-node bookkeeping for n_nodes nodes and grows the task block pool if needed.
+  template <typename Func>
+  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
+    left_right_nodes_sizes_.resize(n_nodes);
+    blocks_offsets_.resize(n_nodes + 1);
+    // blocks_offsets_ becomes the running sum of funcNTask(node), starting from 0.
+    blocks_offsets_.front() = 0;
+    for (size_t node = 0; node < n_nodes; ++node) {
+      blocks_offsets_[node + 1] = blocks_offsets_[node] + funcNTask(node);
+    }
+    if (max_n_tasks_ < n_tasks) {
+      mem_blocks_.resize(n_tasks);
+      max_n_tasks_ = n_tasks;
+    }
+  }
+
+  // Out-of-place counterpart of std::stable_partition over `row_indices`.
+  // The bin of every row is read from `*p_column` at position (row - base_rowid).
+  //  - a bin equal to ColumnType::kMissingId (only tested when any_missing) follows default_left;
+  //  - every other row goes left iff pred(row, bin) holds.
+  // Relative row order is kept on both sides. Returns {#rows in left_part, #rows in right_part}.
+  template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
+  inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
+                                                   common::Span<const size_t> row_indices,
+                                                   common::Span<size_t> left_part,
+                                                   common::Span<size_t> right_part,
+                                                   size_t base_rowid, Predicate&& pred) {
+    auto& column = *p_column;
+    size_t* const p_left = left_part.data();
+    size_t* const p_right = right_part.data();
+    std::pair<size_t, size_t> n_written{0, 0};  // {left, right}
+
+    const auto* p_rows = row_indices.data();
+    const auto n_rows = row_indices.size();
+    for (size_t i = 0; i < n_rows; ++i) {
+      const auto rid = p_rows[i];
+      const int32_t bin_id = column[rid - base_rowid];
+
+      // Decide the side first; the predicate is only consulted for rows that are not missing.
+      bool go_left = default_left;
+      const bool is_missing = any_missing && bin_id == ColumnType::kMissingId;
+      if (!is_missing) {
+        go_left = static_cast<bool>(pred(rid, bin_id));
+      }
+
+      // Then append the row to that side and advance only its counter.
+      if (go_left) {
+        p_left[n_written.first++] = rid;
+      } else {
+        p_right[n_written.second++] = rid;
+      }
+    }
+
+    return n_written;
+  }
+
+  // Out-of-place stable split of `ridx`: rows for which `pred` holds are appended to
+  // `left_part`, all others to `right_part`; the input order is kept on each side.
+  // Returns {number of rows written left, number written right}.
+  template <typename Pred>
+  inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
+                                                        common::Span<size_t> left_part,
+                                                        common::Span<size_t> right_part,
+                                                        Pred pred) {
+    size_t* const p_left = left_part.data();
+    size_t* const p_right = right_part.data();
+    std::pair<size_t, size_t> n_written{0, 0};  // {left, right}
+    for (auto row_id : ridx) {
+      const bool go_left = pred(row_id);
+      // Append to the chosen side and advance only that side's counter.
+      (go_left ? p_left[n_written.first++] : p_right[n_written.second++]) = row_id;
+    }
+    return n_written;
+  }
+
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
+  void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
+                 const common::Range1d range, const bst_bin_t split_cond,
+                 GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
+                 const RegTree& tree, const size_t* rid) {  // splits rid[range) of one node
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
+    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
+    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bst_feature_t fid = tree.SplitIndex(nid);
+    bool default_left = tree.DefaultLeft(nid);
+    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
+    auto node_cats = tree.NodeCats(nid);
+    auto const& cut_values = gmat.cut.Values();
+    // Used with the column matrix: categorical nodes decide on the cut value, others on the bin.
+    auto pred_hist = [&](auto ridx, auto bin_id) {
+      if (any_cat && is_cat) {
+        auto gidx = gmat.GetGindex(ridx, fid);
+        bool go_left = default_left;
+        if (gidx > -1) {  // otherwise the default direction is kept
+          go_left = Decision(node_cats, cut_values[gidx]);
+        }
+        return go_left;
+      } else {
+        return bin_id <= split_cond;
+      }
+    };
+    // Used without a column matrix: decide from the cut value found through gmat.GetGindex.
+    auto pred_approx = [&](auto ridx) {
+      auto gidx = gmat.GetGindex(ridx, fid);
+      bool go_left = default_left;
+      if (gidx > -1) {  // otherwise the default direction is kept
+        if (is_cat) {
+          go_left = Decision(node_cats, cut_values[gidx]);
+        } else {
+          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
+        }
+      }
+      return go_left;
+    };
+    // Dispatch: no column matrix -> range kernel; otherwise the dense or sparse column kernel.
+    std::pair<size_t, size_t> child_nodes_sizes;
+    if (!column_matrix.IsInitialized()) {
+      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
+    } else {
+      if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
+        auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
+        if (default_left) {
+          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
+                                                                 gmat.base_rowid, pred_hist);
+        } else {
+          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
+                                                                  gmat.base_rowid, pred_hist);
+        }
+      } else {
+        CHECK_EQ(any_missing, true);  // this branch is only reached for non-dense columns
+        auto column =
+            column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
+        if (default_left) {
+          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
+                                                                 gmat.base_rowid, pred_hist);
+        } else {
+          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
+                                                                  gmat.base_rowid, pred_hist);
+        }
+      }
+    }
+    // Record how many rows of this range went to each side.
+    const size_t n_left  = child_nodes_sizes.first;
+    const size_t n_right = child_nodes_sizes.second;
+
+    SetNLeftElems(node_in_set, range.begin(), n_left);
+    SetNRightElems(node_in_set, range.begin(), n_right);
+  }
+
+  /**
+   * @brief Column-split helper: the split feature may not be present on the current worker, so
+   * for every row in the range we set its bit in `decision_bits` when the row goes left, or its
+   * bit in `missing_bits` when the split feature value is missing.
+   */
+  template <typename ExpandEntry>
+  void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
+                const common::Range1d range, GHistIndexMatrix const& gmat,
+                const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
+                BitVector* decision_bits, BitVector* missing_bits) {
+    common::Span<const size_t> rows(rid + range.begin(), rid + range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bst_feature_t fid = tree[nid].SplitIndex();
+    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
+    auto node_cats = tree.NodeCats(nid);
+    auto const& cut_values = gmat.cut.Values();
+
+    // Only the path without a column matrix is implemented for column-wise data split.
+    if (column_matrix.IsInitialized()) {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    } else {
+      for (auto row_id : rows) {
+        auto const bit_idx = row_id - gmat.base_rowid;  // bits are indexed from base_rowid
+        auto gidx = gmat.GetGindex(row_id, fid);
+        if (gidx <= -1) {
+          // no bin index for this (row, feature) pair: flag the value as missing
+          missing_bits->Set(bit_idx);
+          continue;
+        }
+        // categorical splits are resolved by Decision(), numerical ones by the split value
+        bool go_left = is_cat ? Decision(node_cats, cut_values[gidx])
+                              : cut_values[gidx] <= nodes[node_in_set].split.split_value;
+        if (go_left) {
+          decision_bits->Set(bit_idx);
+        }
+      }
+    }
+  }
+
+  /**
+   * @brief Partition the rows of a node using the decision and missing bits once they have been
+   * aggregated from all the workers. Rows flagged as missing follow the node's default direction.
+   */
+  template <typename ExpandEntry>
+  void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
+                       const common::Range1d range, GHistIndexMatrix const& gmat,
+                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
+                       const size_t* rid, BitVector const& decision_bits,
+                       BitVector const& missing_bits) {
+    common::Span<const size_t> rows(rid + range.begin(), rid + range.end());
+    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
+    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bool default_left = tree[nid].DefaultLeft();
+
+    // A set decision bit sends the row left; the missing bit takes precedence over it.
+    auto goes_left = [&](auto ridx) -> bool {
+      // both bit vectors are indexed relative to gmat.base_rowid
+      auto const bit_idx = ridx - gmat.base_rowid;
+      if (missing_bits.Check(bit_idx)) {
+        return default_left;
+      }
+      return decision_bits.Check(bit_idx);
+    };
+
+    // Only the path without a column matrix is implemented for column-wise data split.
+    std::pair<size_t, size_t> sizes;
+    if (column_matrix.IsInitialized()) {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    } else {
+      sizes = PartitionRangeKernel(rows, left, right, goes_left);
+    }
+
+    // Store the per-block child sizes; they are read back when the blocks are merged.
+    SetNLeftElems(node_in_set, range.begin(), sizes.first);
+    SetNRightElems(node_in_set, range.begin(), sizes.second);
+  }
+
+  // Allocate the memory block of task `id` on first use; should be called for each specific task.
+  void AllocateForTask(size_t id) {
+    if (!mem_blocks_[id]) {
+      // No null check is needed after allocating: failure is reported by throwing.
+      // make_shared keeps the block owned by a smart pointer and zero-initializes it.
+      mem_blocks_[id] = std::make_shared<BlockInfo>();
+    }
+  }
+
+  // Left scratch buffer, sized `end - begin`, of the task identified by (`nid`, `begin`).
+  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
+    return {mem_blocks_.at(GetTaskIdx(nid, begin))->Left(), end - begin};
+  }
+
+  // Right scratch buffer, sized `end - begin`, of the task identified by (`nid`, `begin`).
+  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
+    return {mem_blocks_.at(GetTaskIdx(nid, begin))->Right(), end - begin};
+  }
+
+  // Store the number of rows that went left for the task identified by (`nid`, `begin`).
+  void SetNLeftElems(int nid, size_t begin, size_t n_left) {
+    mem_blocks_.at(GetTaskIdx(nid, begin))->n_left = n_left;
+  }
+
+  // Store the number of rows that went right for the task identified by (`nid`, `begin`).
+  void SetNRightElems(int nid, size_t begin, size_t n_right) {
+    mem_blocks_.at(GetTaskIdx(nid, begin))->n_right = n_right;
+  }
+
+
+  [[nodiscard]] std::size_t GetNLeftElems(int nid) const {
+    return std::get<0>(left_right_nodes_sizes_[nid]);  // left total from CalculateRowOffsets
+  }
+
+  [[nodiscard]] std::size_t GetNRightElems(int nid) const {
+    return std::get<1>(left_right_nodes_sizes_[nid]);  // right total from CalculateRowOffsets
+  }
+
+  // Each thread has partial results for some set of tree-nodes. This function decides the order
+  // of merging them into the final row set: per node, all left rows first, then all right rows.
+  void CalculateRowOffsets() {
+    for (size_t i = 0; i + 1 < blocks_offsets_.size(); ++i) {  // no wrap-around when empty
+      size_t n_left = 0;
+      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
+        mem_blocks_[j]->n_offset_left = n_left;
+        n_left += mem_blocks_[j]->n_left;
+      }
+      size_t n_right = 0;
+      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
+        mem_blocks_[j]->n_offset_right = n_left + n_right;  // right rows start after all left rows
+        n_right += mem_blocks_[j]->n_right;
+      }
+      left_right_nodes_sizes_[i] = {n_left, n_right};
+    }
+  }
+
+  // Copy this task's left/right partial results to their final offsets within `rows_indexes`.
+  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
+    auto& block = *mem_blocks_[GetTaskIdx(nid, begin)];
+
+    // rows that went left
+    auto* dst_left = rows_indexes + block.n_offset_left;
+    std::copy_n(block.Left(), block.n_left, dst_left);
+
+    // rows that went right
+    auto* dst_right = rows_indexes + block.n_offset_right;
+    std::copy_n(block.Right(), block.n_right, dst_right);
+  }
+
+  size_t GetTaskIdx(int nid, size_t begin) {
+    return begin / BlockSize + blocks_offsets_[nid];  // block within the node + node's first block
+  }
+
+  // Copy row partitions into global cache for reuse in objective. Rows for which `sampledp`
+  // returns true are stored as the bitwise complement of their leaf index.
+  template <typename Sampledp>
+  void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
+                     std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
+    auto& positions = *p_position;
+    positions.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
+    auto row_data_begin = row_set.Data()->data();
+    ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
+      auto const& node = row_set[i];
+      if (node.node_id < 0) {
+        return;
+      }
+      CHECK(tree.IsLeaf(node.node_id));
+      if (node.begin) {  // guard for empty node.
+        size_t ptr_offset = node.end - row_data_begin;
+        CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
+        for (auto it = node.begin; it != node.end; ++it) {
+          positions[*it] = sampledp(*it) ? ~node.node_id : node.node_id;
+        }
+      }
+    });
+  }
+
+ protected:
+  // Per-task scratch space: row indices routed to the left/right child plus their bookkeeping.
+  struct BlockInfo{
+    size_t n_left;   // number of valid entries in the left buffer
+    size_t n_right;  // number of valid entries in the right buffer
+    // offsets at which this block's results are merged, see CalculateRowOffsets/MergeToArray
+    size_t n_offset_left;
+    size_t n_offset_right;
+
+    size_t* Left() {
+      return left_data_;
+    }
+    size_t* Right() {
+      return right_data_;
+    }
+   private:
+    size_t left_data_[BlockSize];
+    size_t right_data_[BlockSize];
+  };
+  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
+  std::vector<size_t> blocks_offsets_;
+  std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
+  size_t max_n_tasks_ = 0;
+};
+}  // namespace xgboost::common
+#endif  // XGBOOST_COMMON_PARTITION_BUILDER_H_
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
                 HistogramCuts *cuts) {
  size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
  auto &cut_values = cuts->cut_values_.HostVector();
+  // we use the min_value as the first (0th) element, hence starting from 1.
  for (size_t i = 1; i < required_cuts; ++i) {
    bst_float cpt = summary.data[i].value;
    if (i == 1 || cpt > cut_values.back()) {
@@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
    } else {
      AddCutPoint<WQSketch>(a, max_num_bins, cuts);
      // push a value that is greater than anything
-      const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
-                                         : cuts->min_vals_.HostVector()[fid];
+      const bst_float cpt =
+          (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
      // this must be bigger than last value in a scale
      const bst_float last = cpt + (fabs(cpt) + 1e-5f);
      cuts->cut_values_.HostVector().push_back(last);
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -352,19 +352,6 @@ struct WQSummary {
      prev_rmax = data[i].rmax;
    }
  }
-  // check consistency of the summary
-  inline bool Check(const char *msg) const {
-    const float tol = 10.0f;
-    for (size_t i = 0; i < this->size; ++i) {
-      if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
-          data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
-        LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
-        this->Print();
-        return false;
-      }
-    }
-    return true;
-  }
 };

 /*! \brief try to do efficient pruning */
--- a/src/common/ranking_utils.cc
+++ b/src/common/ranking_utils.cc
@@ -6,9 +6,7 @@
 #include <algorithm>          // for copy_n, max, min, none_of, all_of
 #include <cstddef>            // for size_t
 #include <cstdio>             // for sscanf
-#include <exception>          // for exception
 #include <functional>         // for greater
-#include <iterator>           // for reverse_iterator
 #include <string>             // for char_traits, string

 #include "algorithm.h"        // for ArgSort
@@ -18,12 +16,113 @@
 #include "xgboost/base.h"     // for bst_group_t
 #include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"     // for MetaInfo
-#include "xgboost/linalg.h"   // for All, TensorView, Range, Tensor, Vector
-#include "xgboost/logging.h"  // for Error, LogCheck_EQ, CHECK_EQ
+#include "xgboost/linalg.h"   // for All, TensorView, Range
+#include "xgboost/logging.h"  // for CHECK_EQ

 namespace xgboost::ltr {
+void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
+  if (!info.group_ptr_.empty()) {
+    group_ptr_.HostVector() = info.group_ptr_;
+  } else {
+    group_ptr_.Resize(2, 0);  // no group info: a single group covering all rows
+    group_ptr_.HostVector()[1] = info.num_row_;
+  }
+  // Record the size of the largest group.
+  auto const& h_group_ptr = group_ptr_.ConstHostVector();
+  for (std::size_t g = 1; g < h_group_ptr.size(); ++g) {
+    std::size_t group_size = h_group_ptr[g] - h_group_ptr[g - 1];
+    max_group_size_ = std::max(max_group_size_, group_size);
+  }
+  // Weight normalization: number of groups divided by the sum of the first `n_groups` weights.
+  auto n_groups = Groups();
+  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
+  double sum_weights = 0;
+  for (bst_omp_uint k = 0; k < n_groups; ++k) {
+    sum_weights += weight[k];
+  }
+  weight_norm_ = static_cast<double>(n_groups) / sum_weights;
+}
+
+common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
+                                                            common::Span<float const> predt) {
+  auto h_group_ptr = this->DataGroupPtr(ctx);
+  auto h_rank = this->sorted_idx_cache_.HostSpan();
+  CHECK_EQ(h_rank.size(), predt.size());
+  // Argsort the predictions of every group in descending order, writing into the cached buffer.
+  common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) {
+    auto g_begin = h_group_ptr[g];
+    auto g_size = h_group_ptr[g + 1] - g_begin;
+    auto g_predt = predt.subspan(g_begin, g_size);
+    auto g_rank = h_rank.subspan(g_begin, g_size);
+    auto g_sorted = common::ArgSort<std::size_t>(
+        ctx, g_predt.data(), g_predt.data() + g_predt.size(), std::greater<>{});
+    CHECK_EQ(g_rank.size(), g_sorted.size());
+    std::copy_n(g_sorted.data(), g_sorted.size(), g_rank.data());
+  });
+  return h_rank;
+}
+
+#if !defined(XGBOOST_USE_CUDA)  // stubs for builds without CUDA
+void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
+common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
+                                                             common::Span<float const>) {
+  common::AssertGPUSupport();
+  return {};  // empty span
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+
+void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
+  auto const h_group_ptr = this->DataGroupPtr(ctx);
+  // Cache the DCG discount for every position up to the size of the largest group.
+  discounts_.Resize(MaxGroupSize(), 0);
+  auto& h_discounts = discounts_.HostVector();
+  for (std::size_t pos = 0; pos < MaxGroupSize(); ++pos) {
+    h_discounts[pos] = CalcDCGDiscount(pos);
+  }
+
+  auto n_groups = h_group_ptr.size() - 1;
+  auto h_labels = info.labels.HostView().Slice(linalg::All(), 0);
+  // Run the NDCG label check, with std::none_of as the host-side reduction.
+  CheckNDCGLabels(this->Param(), h_labels,
+                  [](auto beg, auto end, auto op) { return std::none_of(beg, end, op); });
+
+  inv_idcg_.Reshape(n_groups);
+  auto h_inv_idcg = inv_idcg_.HostView();
+  std::size_t topk = this->Param().TopK();
+  auto const exp_gain = this->Param().ndcg_exp_gain;
+  // Inverse ideal DCG per group: sort labels in descending order and accumulate the top-k gains.
+  common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) {
+    auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1]));
+    auto sorted_idx = common::ArgSort<std::size_t>(ctx, linalg::cbegin(g_labels),
+                                                   linalg::cend(g_labels), std::greater<>{});
+    auto const n_top = std::min(g_labels.Size(), topk);  // truncate at top-k
+    double idcg{0.0};
+    for (std::size_t i = 0; i < n_top; ++i) {
+      if (exp_gain) {
+        idcg += h_discounts[i] * CalcDCGGain(g_labels(sorted_idx[i]));
+      } else {
+        idcg += h_discounts[i] * g_labels(sorted_idx[i]);
+      }
+    }
+    h_inv_idcg(g) = CalcInvIDCG(idcg);
+  });
+}
+
+#if !defined(XGBOOST_USE_CUDA)
+void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 DMLC_REGISTER_PARAMETER(LambdaRankParam);

+void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {
+  auto const h_label = info.labels.HostView().Slice(linalg::All(), 0);  // column 0 of the labels
+  CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
+}
+
+#if !defined(XGBOOST_USE_CUDA)
+void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
  std::string out_name;
  if (!param.empty()) {
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -0,0 +1,212 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include <thrust/functional.h>                  // for maximum
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/logical.h>                     // for none_of, all_of
+#include <thrust/pair.h>                        // for pair, make_pair
+#include <thrust/reduce.h>                      // for reduce
+#include <thrust/scan.h>                        // for inclusive_scan
+
+#include <cstddef>                              // for size_t
+
+#include "algorithm.cuh"                        // for SegmentedArgSort
+#include "cuda_context.cuh"                     // for CUDAContext
+#include "device_helpers.cuh"                   // for MakeTransformIterator, LaunchN
+#include "optional_weight.h"                    // for MakeOptionalWeights, OptionalWeights
+#include "ranking_utils.cuh"                    // for ThreadsForMean
+#include "ranking_utils.h"
+#include "threading_utils.cuh"                  // for SegmentedTrapezoidThreads
+#include "xgboost/base.h"                       // for XGBOOST_DEVICE, bst_group_t
+#include "xgboost/context.h"                    // for Context
+#include "xgboost/linalg.h"                     // for VectorView, All, Range
+#include "xgboost/logging.h"                    // for CHECK
+#include "xgboost/span.h"                       // for Span
+
+namespace xgboost::ltr {
+namespace cuda_impl {
+void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
+                    common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
+                    common::Span<bst_group_t const> d_group_ptr, std::size_t k,
+                    linalg::VectorView<double> out_dcg) {
+  CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size());  // one output value per group
+  using IdxGroup = thrust::pair<std::size_t, std::size_t>;  // (element index, group index)
+  auto group_it = dh::MakeTransformIterator<IdxGroup>(
+      thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) {
+        return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx));  // NOLINT
+      });
+  auto value_it = dh::MakeTransformIterator<double>(  // discounted gain of each element
+      group_it,
+      [exp_gain, d_labels, d_group_ptr, k,
+       d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double {
+        auto g_begin = d_group_ptr[l.second];
+        auto g_size = d_group_ptr[l.second + 1] - g_begin;
+        // position of the element inside its group; positions >= k contribute nothing
+        auto idx_in_group = l.first - g_begin;
+        if (idx_in_group >= k) {
+          return 0.0;
+        }
+        double gain{0.0};
+        auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size);
+        auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size));
+        // gain of the label found at this position of the group's sorted index
+        if (exp_gain) {
+          gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group]));
+        } else {
+          gain = g_labels(g_sorted_idx[idx_in_group]);
+        }
+        double discount = CalcDCGDiscount(idx_in_group);
+        return gain * discount;
+      });
+  // Sum the values of each group; the first call only queries the temporary storage size.
+  CHECK(out_dcg.Contiguous());
+  std::size_t bytes;
+  cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
+                                  d_group_ptr.size() - 1, d_group_ptr.data(),
+                                  d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
+  dh::TemporaryArray<char> temp(bytes);
+  cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
+                                  d_group_ptr.size() - 1, d_group_ptr.data(),
+                                  d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
+}
+
+void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
+                        common::Span<bst_group_t const> d_group_ptr,
+                        linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const& p) {
+  CHECK_GE(d_group_ptr.size(), 2ul);  // at least one group
+  size_t n_groups = d_group_ptr.size() - 1;
+  CHECK_EQ(out_inv_IDCG.Size(), n_groups);  // one output value per group
+  dh::device_vector<std::size_t> sorted_idx(d_labels.Size());  // per-group argsort of labels
+  auto d_sorted_idx = dh::ToSpan(sorted_idx);
+  common::SegmentedArgSort<false, true>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
+  CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG);
+  dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(),  // turn each DCG into its inverse
+              [out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable {
+                double idcg = out_inv_IDCG(idx);
+                out_inv_IDCG(idx) = CalcInvIDCG(idcg);  // overwritten in place
+              });
+}
+}  // namespace cuda_impl
+
+namespace {
+struct CheckNDCGOp {  // wraps thrust::none_of; passed to CheckNDCGLabels
+  CUDAContext const* cuctx;
+  template <typename It, typename Op>
+  bool operator()(It beg, It end, Op op) {
+    return thrust::none_of(cuctx->CTP(), beg, end, op);
+  }
+};
+struct CheckMAPOp {  // wraps thrust::all_of; passed to CheckMapLabels
+  CUDAContext const* cuctx;
+  template <typename It, typename Op>
+  bool operator()(It beg, It end, Op op) {
+    return thrust::all_of(cuctx->CTP(), beg, end, op);
+  }
+};
+// Writes the number of threads for group `i` to `out_thread_group_ptr[i + 1]`.
+struct ThreadGroupOp {
+  common::Span<bst_group_t const> d_group_ptr;
+  std::size_t n_pairs;
+
+  common::Span<std::size_t> out_thread_group_ptr;
+
+  XGBOOST_DEVICE void operator()(std::size_t i) {
+    out_thread_group_ptr[i + 1] =
+        cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs);
+  }
+};
+// Returns the number of elements in group `i`.
+struct GroupSizeOp {
+  common::Span<bst_group_t const> d_group_ptr;
+
+  XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t {
+    return d_group_ptr[i + 1] - d_group_ptr[i];
+  }
+};
+// Returns the `i`-th weight as a double.
+struct WeightOp {
+  common::OptionalWeights d_weight;
+  XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; }
+};
+}  // anonymous namespace
+
+void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
+  CUDAContext const* cuctx = ctx->CUDACtx();
+  // Set up the group pointer (offsets to data groups).
+  group_ptr_.SetDevice(ctx->gpu_id);
+  if (info.group_ptr_.empty()) {
+    group_ptr_.Resize(2, 0);  // no group info: a single group covering all rows
+    group_ptr_.HostVector()[1] = info.num_row_;
+  } else {
+    auto const& h_group_ptr = info.group_ptr_;
+    group_ptr_.Resize(h_group_ptr.size());
+    auto d_group_ptr = group_ptr_.DeviceSpan();
+    dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
+                                  cudaMemcpyHostToDevice, cuctx->Stream()));
+  }
+
+  auto d_group_ptr = DataGroupPtr(ctx);
+  std::size_t n_groups = Groups();
+  // Size of the largest group, computed as a max-reduction over the per-group sizes.
+  auto it = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
+                                                   GroupSizeOp{d_group_ptr});
+  max_group_size_ =
+      thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
+  // Offsets of the threads assigned to each group, and the total number of CUDA threads.
+  threads_group_ptr_.SetDevice(ctx->gpu_id);
+  threads_group_ptr_.Resize(n_groups + 1, 0);
+  auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
+  if (param_.HasTruncation()) {
+    n_cuda_threads_ =
+        common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
+  } else {
+    auto n_pairs = Param().NumPair();
+    dh::LaunchN(n_groups, cuctx->Stream(),  // per-group thread counts, stored at index g + 1
+                ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr});
+    thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr),  // counts -> offsets
+                           dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr));
+    n_cuda_threads_ = info.num_row_ * param_.NumPair();
+  }
+  // Buffer that later receives the sorted index of predictions (see MakeRankOnCUDA).
+  sorted_idx_cache_.SetDevice(ctx->gpu_id);
+  sorted_idx_cache_.Resize(info.labels.Size(), 0);
+  // Weight normalization. NOTE(review): unlike the calls above, this reduce has no exec policy.
+  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
+  auto w_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), WeightOp{weight});
+  weight_norm_ = static_cast<double>(n_groups) / thrust::reduce(w_it, w_it + n_groups);
+}
+
+common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
+                                                             common::Span<float const> predt) {
+  auto d_sorted_idx = sorted_idx_cache_.DeviceSpan();  // cached output buffer
+  auto d_group_ptr = DataGroupPtr(ctx);
+  common::SegmentedArgSort<false, true>(ctx, predt, d_group_ptr, d_sorted_idx);  // sort per group
+  return d_sorted_idx;  // a view of the cache, not a copy
+}
+
+void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
+  CUDAContext const* cuctx = ctx->CUDACtx();
+  auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);  // column 0 of the labels
+  CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
+
+  auto d_group_ptr = this->DataGroupPtr(ctx);
+  // One inverse IDCG value per group, computed on the device.
+  std::size_t n_groups = d_group_ptr.size() - 1;
+  inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
+  auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
+  cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
+  CHECK_GE(this->Param().NumPair(), 1ul);
+  // Cache the DCG discount for every position up to the size of the largest group.
+  discounts_.SetDevice(ctx->gpu_id);
+  discounts_.Resize(MaxGroupSize());
+  auto d_discount = discounts_.DeviceSpan();
+  dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
+              [=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
+}
+
+void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
+  auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);  // label column 0
+  CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()});  // device-side all_of as the reduction
+}
+}  // namespace xgboost::ltr
--- a/src/common/ranking_utils.cuh
+++ b/src/common/ranking_utils.cuh
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_
+#define XGBOOST_COMMON_RANKING_UTILS_CUH_
+
+#include <cstddef>            // for size_t
+
+#include "ranking_utils.h"    // for LambdaRankParam
+#include "xgboost/base.h"     // for bst_group_t, XGBOOST_DEVICE
+#include "xgboost/context.h"  // for Context
+#include "xgboost/linalg.h"   // for VectorView
+#include "xgboost/span.h"     // for Span
+
+namespace xgboost {
+namespace ltr {
+namespace cuda_impl {
+void CalcQueriesDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
+                    common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
+                    common::Span<bst_group_t const> d_group_ptr, std::size_t k,
+                    linalg::VectorView<double> out_dcg);
+
+void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
+                        common::Span<bst_group_t const> d_group_ptr,
+                        linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const &p);
+
+// Functions for creating number of threads for CUDA, and getting back the number of pairs
+// from the number of threads: `PairsForGroup(ThreadsForMean(g, p), g) == p` for non-empty groups.
+XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size,
+                                                          std::size_t n_pairs) {
+  return group_size * n_pairs;  // `n_pairs` threads for every element of the group
+}
+XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
+                                                         std::size_t group_size) {
+  return n_threads / group_size;  // integer division; `group_size` must be non-zero
+}
+}  // namespace cuda_impl
+}  // namespace ltr
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_RANKING_UTILS_CUH_
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -11,7 +11,6 @@
 #include <string>                        // for char_traits, string
 #include <vector>                        // for vector

-#include "./math.h"                      // for CloseTo
 #include "dmlc/parameter.h"              // for FieldEntry, DMLC_DECLARE_FIELD
 #include "error_msg.h"                   // for GroupWeight, GroupSize
 #include "xgboost/base.h"                // for XGBOOST_DEVICE, bst_group_t
@@ -19,7 +18,7 @@
 #include "xgboost/data.h"                // for MetaInfo
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/linalg.h"              // for Vector, VectorView, Tensor
-#include "xgboost/logging.h"             // for LogCheck_EQ, CHECK_EQ, CHECK
+#include "xgboost/logging.h"             // for CHECK_EQ, CHECK
 #include "xgboost/parameter.h"           // for XGBoostParameter
 #include "xgboost/span.h"                // for Span
 #include "xgboost/string_view.h"         // for StringView
@@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t;  // NOLINT
 */
 using position_t = std::uint32_t;  // NOLINT

+/**
+ * \brief Maximum relevance degree for NDCG
+ */
+constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; }
+static_assert(MaxRel() == 31);
+
+XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
+  return static_cast<double>((1u << label) - 1);  // 2^label - 1; shift needs label <= MaxRel()
+}
+
+XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
+  return 1.0 / std::log2(static_cast<double>(idx) + 2.0);  // idx = 0 gives 1 / log2(2) = 1
+}
+
+// Inverse of the ideal DCG; 0 is returned for idcg == 0 (handle irrelevant document).
+XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
+  return idcg == 0.0 ? 0.0 : 1.0 / idcg;
+}
+
 enum class PairMethod : std::int32_t {
  kTopK = 0,
  kMean = 1,
@@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
        .describe("Number of pairs for each sample in the list.");
    DMLC_DECLARE_FIELD(lambdarank_unbiased)
        .set_default(false)
-        .describe("Unbiased lambda mart. Use IPW to debias click position");
+        .describe("Unbiased lambda mart. Use extended IPW to debias click position");
    DMLC_DECLARE_FIELD(lambdarank_bias_norm)
        .set_default(2.0)
        .set_lower_bound(0.0)
@@ -126,6 +144,285 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
  }
 };

+/**
+ * \brief Cache of items shared by ranking objectives and metrics.
+ */
+class RankingCache {
+ private:
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+  // Copy of the parameter this cache was created with
+  LambdaRankParam param_;
+  // Offsets into the data marking the boundary of each group
+  HostDeviceVector<bst_group_t> group_ptr_;
+  // Buffer for the sorted index of prediction
+  HostDeviceVector<std::size_t> sorted_idx_cache_;
+  // Size of the largest group
+  std::size_t max_group_size_{0};
+  // Normalization factor for weight
+  double weight_norm_{1.0};
+  /**
+   * CUDA cache
+   */
+  // Offsets to the threads assigned to each group for gradient calculation
+  HostDeviceVector<std::size_t> threads_group_ptr_;
+  // Sorted index of label, used for finding buckets
+  HostDeviceVector<std::size_t> y_sorted_idx_cache_;
+  // Labels ordered by the model's ranking
+  HostDeviceVector<float> y_ranked_by_model_;
+  // Per-group rounding factor for the objective
+  linalg::Vector<GradientPair> roundings_;
+  // Rounding factor for the cost
+  HostDeviceVector<double> cost_rounding_;
+  // Scratch space used while creating rounding factors. Kept as raw bytes so that no
+  // CUDA data structure leaks into this header.
+  HostDeviceVector<std::uint8_t> max_lambdas_;
+  // Total number of CUDA threads used for gradient calculation
+  std::size_t n_cuda_threads_{0};
+
+  // Build the model rank list on GPU
+  common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
+                                                 common::Span<float const> predt);
+  // Build the model rank list on CPU
+  common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
+                                                common::Span<float const> predt);
+
+ protected:
+  [[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
+
+ public:
+  /**
+   * \brief Validate the group information and initialise the cache on the device
+   *        selected by ctx.
+   *
+   * \param ctx  Context used to choose between the CPU and the CUDA initialisation.
+   * \param info Meta info providing group pointers, labels and weights.
+   * \param p    Ranking parameter, must already be initialised.
+   */
+  RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
+    CHECK(param_.GetInitialised());
+    bool const has_group_ptr = !info.group_ptr_.empty();
+    if (has_group_ptr) {
+      CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
+          << error::GroupSize() << "the size of label.";
+    }
+    if (!ctx->IsCPU()) {
+      this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
+    }
+    bool const has_weights = !info.weights_.Empty();
+    if (has_weights) {
+      // When weights are given, there must be exactly one per group.
+      CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
+    }
+  }
+  // Upper bound on the number of positions to keep track of.
+  [[nodiscard]] std::size_t MaxPositionSize() const {
+    if (!param_.HasTruncation()) {
+      // Hardcoded limit on the number of tracked positions. A small number suffices
+      // since the bias decreases exponentially.
+      return std::min(max_group_size_, static_cast<std::size_t>(32));
+    }
+    // With truncation, the truncation level is the bound.
+    return param_.NumPair();
+  }
+  // Constructed as [1, n_samples] if group ptr is not supplied by the user
+  common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
+    group_ptr_.SetDevice(ctx->gpu_id);
+    if (ctx->IsCPU()) {
+      return group_ptr_.ConstHostSpan();
+    }
+    return group_ptr_.ConstDeviceSpan();
+  }
+
+  [[nodiscard]] auto const& Param() const { return param_; }
+  // Number of groups, one less than the number of group offsets.
+  [[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
+  [[nodiscard]] double WeightNorm() const { return weight_norm_; }
+
+  // Rank the samples by model prediction. The index buffer is allocated on first use.
+  common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
+    if (sorted_idx_cache_.Empty()) {
+      sorted_idx_cache_.SetDevice(ctx->gpu_id);
+      sorted_idx_cache_.Resize(predt.size());
+    }
+    return ctx->IsCPU() ? this->MakeRankOnCPU(ctx, predt) : this->MakeRankOnCUDA(ctx, predt);
+  }
+  // Hands out an uninitialized device buffer; the objective is the only user and fills
+  // it while creating pairs. CUDA only.
+  common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
+    CHECK(ctx->IsCUDA());
+    auto& buffer = y_sorted_idx_cache_;
+    if (buffer.Empty()) {
+      buffer.SetDevice(ctx->gpu_id);
+      buffer.Resize(n_samples);
+    }
+    return buffer.DeviceSpan();
+  }
+  // Device buffer for labels ranked by the model, allocated on first use. CUDA only.
+  common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
+    CHECK(ctx->IsCUDA());
+    auto& buffer = y_ranked_by_model_;
+    if (buffer.Empty()) {
+      buffer.SetDevice(ctx->gpu_id);
+      buffer.Resize(n_samples);
+    }
+    return buffer.DeviceSpan();
+  }
+
+  // Getters for the CUDA cache. Metric and objective share the cache, and some fields
+  // are initialized lazily so that nothing is allocated unless it's needed.
+  [[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
+    CHECK(!threads_group_ptr_.Empty());
+    return threads_group_ptr_.ConstDeviceSpan();
+  }
+  [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
+
+  // View of the per-group rounding factors, shaped to the number of groups on first use.
+  linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
+    auto const device = ctx->gpu_id;
+    if (roundings_.Size() == 0) {
+      roundings_.SetDevice(device);
+      roundings_.Reshape(Groups());
+    }
+    return roundings_.View(device);
+  }
+  // Single-element device buffer for the cost rounding factor, allocated on first use.
+  common::Span<double> CUDACostRounding(Context const* ctx) {
+    if (cost_rounding_.Size() == 0) {
+      cost_rounding_.SetDevice(ctx->gpu_id);
+      cost_rounding_.Resize(1);
+    }
+    return cost_rounding_.DeviceSpan();
+  }
+  // Reinterpret the byte scratch buffer as n elements of Type, resizing it whenever the
+  // required number of bytes differs from the current size.
+  template <typename Type>
+  common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
+    max_lambdas_.SetDevice(ctx->gpu_id);
+    std::size_t const n_bytes = n * sizeof(Type);
+    if (max_lambdas_.Size() != n_bytes) {
+      max_lambdas_.Resize(n_bytes);
+    }
+    auto* typed_ptr = reinterpret_cast<Type*>(max_lambdas_.DevicePointer());
+    return common::Span<Type>{typed_ptr, n};
+  }
+};
+
+/**
+ * \brief Ranking cache extended with the items needed for NDCG.
+ */
+class NDCGCache : public RankingCache {
+  // Discount used by NDCG
+  HostDeviceVector<double> discounts_;
+  // Inverse of IDCG (1.0 / IDCG)
+  linalg::Vector<double> inv_idcg_;
+  /**
+   * CUDA cache
+   */
+  // Intermediate result of the DCG calculation, used by the metric
+  linalg::Vector<double> dcg_;
+
+ public:
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+
+  /**
+   * \brief Build the base cache, then run the NDCG specific initialisation on the device
+   *        selected by ctx.
+   */
+  NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
+      : RankingCache{ctx, info, p} {
+    if (!ctx->IsCPU()) {
+      this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
+    }
+  }
+
+  // Read-only view of 1.0 / IDCG on the device given by ctx.
+  linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
+    auto const device = ctx->gpu_id;
+    return inv_idcg_.View(device);
+  }
+  // Read-only span of the discounts, host or device depending on ctx.
+  common::Span<double const> Discount(Context const* ctx) const {
+    if (ctx->IsCPU()) {
+      return discounts_.ConstHostSpan();
+    }
+    return discounts_.ConstDeviceSpan();
+  }
+  // Storage for DCG with one element per group, allocated on first use.
+  linalg::VectorView<double> Dcg(Context const* ctx) {
+    auto const device = ctx->gpu_id;
+    if (dcg_.Size() == 0) {
+      dcg_.SetDevice(device);
+      dcg_.Reshape(this->Groups());
+    }
+    return dcg_.View(device);
+  }
+};
+
+/**
+ * \brief Validate label for NDCG
+ *
+ * Checks are only performed when the exponential gain (`ndcg_exp_gain`) is enabled: every
+ * label must be a non-negative integer (within kRtEps) and must not exceed MaxRel().
+ *
+ * \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
+ *                check for both CPU and GPU.
+ *
+ * \param p       Ranking parameter, only `ndcg_exp_gain` is read.
+ * \param labels  Labels to validate.
+ * \param none_of Callable invoked as none_of(begin, end, predicate).
+ */
+template <typename NoneOf>
+void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
+                     NoneOf none_of) {
+  if (!p.ndcg_exp_gain) {
+    return;  // nothing is enforced here for a custom gain
+  }
+
+  auto d_labels = labels.Values();
+  auto label_is_integer =
+      none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
+        auto l = std::floor(v);
+        return std::fabs(l - v) > kRtEps || v < 0.0f;
+      });
+  CHECK(label_is_integer)
+      << "When using relevance degree as target, label must be either 0 or positive integer.";
+
+  // Compare in floating point. Converting the label to rel_degree_t before the comparison
+  // is undefined for values outside the range of the integer type, which could let a huge
+  // label pass the check.
+  auto label_is_valid =
+      none_of(d_labels.data(), d_labels.data() + d_labels.size(),
+              [] XGBOOST_DEVICE(float v) { return v > static_cast<float>(MaxRel()); });
+  CHECK(label_is_valid) << "Relevance degress must be lesser than or equal to " << MaxRel()
+                        << " when the exponential NDCG gain function is used. "
+                        << "Set `ndcg_exp_gain` to false to use custom DCG gain.";
+}
+
+/**
+ * \brief Whether every label equals 0 or 1 within a tolerance of kRtEps.
+ *
+ * \tparam AllOf Implementation of std::all_of, invoked as all_of(begin, end, predicate).
+ */
+template <typename AllOf>
+bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
+  auto s_label = label.Values();
+  auto first = s_label.data();
+  auto last = s_label.data() + s_label.size();
+  return all_of(first, last, [] XGBOOST_DEVICE(float y) {
+    bool const is_one = std::abs(y - 1.0f) < kRtEps;
+    bool const is_zero = std::abs(y - 0.0f) < kRtEps;
+    return is_one || is_zero;
+  });
+}
+/**
+ * \brief Validate label for MAP
+ *
+ * \tparam AllOf Implementation of std::all_of. Specified as a parameter to reuse the check
+ *               for both CPU and GPU.
+ *
+ * \param label  Labels to validate; the check fails unless IsBinaryRel accepts them.
+ * \param all_of Callable forwarded to IsBinaryRel.
+ */
+template <typename AllOf>
+void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
+  auto is_binary = IsBinaryRel(label, all_of);
+  CHECK(is_binary) << "MAP can only be used with binary labels.";
+}
+
+/**
+ * \brief Ranking cache extended with the buffers needed for MAP.
+ */
+class MAPCache : public RankingCache {
+  // Total number of relevant documents for each group (the buffer is sized by the number
+  // of samples)
+  HostDeviceVector<double> n_rel_;
+  // \sum l_k/k (the buffer is sized by the number of samples)
+  HostDeviceVector<double> acc_;
+  // Buffer with one element per group
+  HostDeviceVector<double> map_;
+  // Number of samples in this dataset, taken from MetaInfo::num_row_.
+  std::size_t n_samples_{0};
+
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+
+ public:
+  /**
+   * \brief Build the base cache, then run the MAP specific initialisation on the device
+   *        selected by ctx.
+   */
+  MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
+      : RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
+    if (!ctx->IsCPU()) {
+      this->InitOnCUDA(ctx, info);
+    } else {
+      this->InitOnCPU(ctx, info);
+    }
+  }
+
+  // Buffer of relevant document counts, allocated on first use.
+  common::Span<double> NumRelevant(Context const* ctx) {
+    if (n_rel_.Empty()) {
+      n_rel_.SetDevice(ctx->gpu_id);
+      n_rel_.Resize(n_samples_);
+    }
+    if (ctx->IsCPU()) {
+      return n_rel_.HostSpan();
+    }
+    return n_rel_.DeviceSpan();
+  }
+  // Buffer of the accumulated \sum l_k/k, allocated on first use.
+  common::Span<double> Acc(Context const* ctx) {
+    if (acc_.Empty()) {
+      acc_.SetDevice(ctx->gpu_id);
+      acc_.Resize(n_samples_);
+    }
+    if (ctx->IsCPU()) {
+      return acc_.HostSpan();
+    }
+    return acc_.DeviceSpan();
+  }
+  // Buffer with one element per group, allocated on first use.
+  common::Span<double> Map(Context const* ctx) {
+    if (map_.Empty()) {
+      map_.SetDevice(ctx->gpu_id);
+      map_.Resize(this->Groups());
+    }
+    if (ctx->IsCPU()) {
+      return map_.HostSpan();
+    }
+    return map_.DeviceSpan();
+  }
+};
+
 /**
 * \brief Parse name for ranking metric given parameters.
 *
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -8,9 +8,11 @@
 #include <dmlc/omp.h>

 #include <algorithm>
-#include <cstdint>  // std::int32_t
+#include <cstdint>  // for int32_t
+#include <cstdlib>  // for malloc, free
 #include <limits>
-#include <type_traits>  // std::is_signed
+#include <new>          // for bad_alloc
+#include <type_traits>  // for is_signed
 #include <vector>

 #include "xgboost/logging.h"
@@ -266,7 +268,7 @@ class MemStackAllocator {
    if (MaxStackSize >= required_size_) {
      ptr_ = stack_mem_;
    } else {
-      ptr_ = reinterpret_cast<T*>(malloc(required_size_ * sizeof(T)));
+      ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
    }
    if (!ptr_) {
      throw std::bad_alloc{};
@@ -278,7 +280,7 @@ class MemStackAllocator {

  ~MemStackAllocator() {
    if (required_size_ > MaxStackSize) {
-      free(ptr_);
+      std::free(ptr_);
    }
  }
  T& operator[](size_t i) { return ptr_[i]; }