From 79efcd37f5638a80ddd2b8bee94dc8e7ec008fab Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Fri, 10 Mar 2023 12:51:43 -0800
Subject: [PATCH 01/32] Pick up dmlc-core fix for CSV parser (#8897)

---
 dmlc-core | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dmlc-core b/dmlc-core
index 81db53948..ea21135fb 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 81db539486ce6525b31b971545edffee2754aced
+Subproject commit ea21135fbb141ae103fb5fc960289b5601b468f2

From 36a73966586daccdb492ebc129cdef69b700732a Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 11 Mar 2023 06:11:04 +0800
Subject: [PATCH 02/32] Replace dmlc any with std any. (#8892)

---
 include/xgboost/gbm.h                     |  1 -
 plugin/updater_oneapi/predictor_oneapi.cc | 23 ++++++-------
 src/data/proxy_dmatrix.cuh                | 21 +++++-------
 src/data/proxy_dmatrix.h                  | 25 +++++---------
 src/predictor/cpu_predictor.cc            | 10 +++---
 src/predictor/gpu_predictor.cu            | 11 +++---
 tests/cpp/data/test_file_iterator.cc      | 13 ++++---
 tests/cpp/data/test_proxy_dmatrix.cu      | 41 ++++++++++-------------
 8 files changed, 64 insertions(+), 81 deletions(-)

diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index d00f9ceaf..07758a524 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -9,7 +9,6 @@
 #define XGBOOST_GBM_H_
 
 #include <dmlc/registry.h>
-#include <dmlc/any.h>
 #include <xgboost/base.h>
 #include <xgboost/data.h>
 #include <xgboost/host_device_vector.h>
diff --git a/plugin/updater_oneapi/predictor_oneapi.cc b/plugin/updater_oneapi/predictor_oneapi.cc
index eafe83e19..59b170b28 100755
--- a/plugin/updater_oneapi/predictor_oneapi.cc
+++ b/plugin/updater_oneapi/predictor_oneapi.cc
@@ -1,23 +1,22 @@
 /*!
  * Copyright by Contributors 2017-2020
  */
+#include <any>  // for any
 #include <cstddef>
 #include <limits>
 #include <mutex>
 
+#include "../../src/common/math.h"
+#include "../../src/data/adapter.h"
+#include "../../src/gbm/gbtree_model.h"
+#include "CL/sycl.hpp"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/logging.h"
 #include "xgboost/predictor.h"
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
-#include "xgboost/logging.h"
-#include "xgboost/host_device_vector.h"
-
-#include "../../src/data/adapter.h"
-#include "../../src/common/math.h"
-#include "../../src/gbm/gbtree_model.h"
-
-#include "CL/sycl.hpp"
 
 namespace xgboost {
 namespace predictor {
@@ -200,7 +199,7 @@ class DeviceModelOneAPI {
 
     tree_beg_ = tree_begin;
     tree_end_ = tree_end;
-    num_group = model.learner_model_param->num_output_group; 
+    num_group = model.learner_model_param->num_output_group;
   }
 };
 
@@ -396,9 +395,9 @@ class PredictorOneAPI : public Predictor {
           out_preds->Size() == dmat->Info().num_row_);
   }
 
-  void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
-                      float missing, PredictionCacheEntry *out_preds,
-                      uint32_t tree_begin, unsigned tree_end) const override {
+  void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing,
+                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
+                      unsigned tree_end) const override {
     cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end);
   }
 
diff --git a/src/data/proxy_dmatrix.cuh b/src/data/proxy_dmatrix.cuh
index 38cbffe50..6ea858e7e 100644
--- a/src/data/proxy_dmatrix.cuh
+++ b/src/data/proxy_dmatrix.cuh
@@ -1,27 +1,24 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023 XGBoost contributors
  */
+#include <any>  // for any, any_cast
+
 #include "device_adapter.cuh"
 #include "proxy_dmatrix.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 template <typename Fn>
 decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
   if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
-    auto value = dmlc::get<std::shared_ptr<CupyAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
     return fn(value);
   } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
-    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
     return fn(value);
   } else {
     LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
-    auto value = dmlc::get<std::shared_ptr<CudfAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
     return fn(value);
   }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index fa55a481f..7a15d6498 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -1,11 +1,10 @@
-/*!
- * Copyright 2020-2022, XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
 #define XGBOOST_DATA_PROXY_DMATRIX_H_
 
-#include <dmlc/any.h>
-
+#include <any>  // for any, any_cast
 #include <memory>
 #include <string>
 #include <utility>
@@ -15,8 +14,7 @@
 #include "xgboost/context.h"
 #include "xgboost/data.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 /*
  * \brief A proxy to external iterator.
  */
@@ -44,7 +42,7 @@ class DataIterProxy {
  */
 class DMatrixProxy : public DMatrix {
   MetaInfo info_;
-  dmlc::any batch_;
+  std::any batch_;
   Context ctx_;
 
 #if defined(XGBOOST_USE_CUDA)
@@ -115,9 +113,7 @@ class DMatrixProxy : public DMatrix {
     LOG(FATAL) << "Not implemented.";
     return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
   }
-  dmlc::any Adapter() const {
-    return batch_;
-  }
+  std::any Adapter() const { return batch_; }
 };
 
 inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
@@ -131,15 +127,13 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
 template <typename Fn>
 decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
   if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
-    auto value =
-        dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
     if (type_error) {
       *type_error = false;
     }
     return fn(value);
   } else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
-    auto value = dmlc::get<std::shared_ptr<ArrayAdapter>>(
-        proxy->Adapter())->Value();
+    auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
     if (type_error) {
       *type_error = false;
     }
@@ -154,6 +148,5 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
         decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
   }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_PROXY_DMATRIX_H_
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 288dc5fb0..4473173d2 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -1,9 +1,9 @@
 /**
  * Copyright 2017-2023 by XGBoost Contributors
  */
-#include <dmlc/any.h>
 #include <dmlc/omp.h>
 
+#include <any>  // for any, any_cast
 #include <cstddef>
 #include <limits>
 #include <mutex>
@@ -637,12 +637,12 @@ class CPUPredictor : public Predictor {
   }
 
   template <typename Adapter, size_t kBlockSize>
-  void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
+  void DispatchedInplacePredict(std::any const &x, std::shared_ptr<DMatrix> p_m,
                                 const gbm::GBTreeModel &model, float missing,
-                                PredictionCacheEntry *out_preds,
-                                uint32_t tree_begin, uint32_t tree_end) const {
+                                PredictionCacheEntry *out_preds, uint32_t tree_begin,
+                                uint32_t tree_end) const {
     auto const n_threads = this->ctx_->Threads();
-    auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
+    auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
     CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
         << "Number of columns in data must equal to trained model.";
     if (p_m) {
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index caf4b6bb4..ecd399e22 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -8,6 +8,7 @@
 #include <thrust/fill.h>
 #include <thrust/host_vector.h>
 
+#include <any>  // for any, any_cast
 #include <memory>
 
 #include "../common/bitfield.h"
@@ -741,13 +742,13 @@ class GPUPredictor : public xgboost::Predictor {
   }
 
   template <typename Adapter, typename Loader>
-  void DispatchedInplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
-                                const gbm::GBTreeModel &model, float missing,
-                                PredictionCacheEntry *out_preds,
-                                uint32_t tree_begin, uint32_t tree_end) const {
+  void DispatchedInplacePredict(std::any const& x, std::shared_ptr<DMatrix> p_m,
+                                const gbm::GBTreeModel& model, float missing,
+                                PredictionCacheEntry* out_preds, uint32_t tree_begin,
+                                uint32_t tree_end) const {
     uint32_t const output_groups =  model.learner_model_param->num_output_group;
 
-    auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
+    auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
     CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
         << "Number of columns in data must equal to trained model.";
     CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc
index 21029620b..31da2c1fa 100644
--- a/tests/cpp/data/test_file_iterator.cc
+++ b/tests/cpp/data/test_file_iterator.cc
@@ -1,8 +1,9 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023 XGBoost contributors
  */
 #include <gtest/gtest.h>
 
+#include <any>  // for any_cast
 #include <memory>
 
 #include "../../../src/data/adapter.h"
@@ -11,15 +12,14 @@
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
 #include "../helpers.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(FileIterator, Basic) {
   auto check_n_features = [](FileIterator *iter) {
     size_t n_features = 0;
     iter->Reset();
     while (iter->Next()) {
       auto proxy = MakeProxy(iter->Proxy());
-      auto csr = dmlc::get<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
+      auto csr = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
       n_features = std::max(n_features, csr->NumColumns());
     }
     ASSERT_EQ(n_features, 5);
@@ -42,5 +42,4 @@ TEST(FileIterator, Basic) {
     check_n_features(&iter);
   }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu
index a599ada6d..ab38f51bb 100644
--- a/tests/cpp/data/test_proxy_dmatrix.cu
+++ b/tests/cpp/data/test_proxy_dmatrix.cu
@@ -1,22 +1,24 @@
+/**
+ * Copyright 2020-2023 XGBoost contributors
+ */
 #include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
+
+#include <any>  // for any_cast
 #include <memory>
-#include "../helpers.h"
+
 #include "../../../src/data/device_adapter.cuh"
 #include "../../../src/data/proxy_dmatrix.h"
+#include "../helpers.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(ProxyDMatrix, DeviceData) {
   constexpr size_t kRows{100}, kCols{100};
   HostDeviceVector<float> storage;
-  auto data = RandomDataGenerator(kRows, kCols, 0.5)
-                  .Device(0)
-                  .GenerateArrayInterface(&storage);
+  auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage);
   std::vector<HostDeviceVector<float>> label_storage(1);
-  auto labels = RandomDataGenerator(kRows, 1, 0)
-                    .Device(0)
-                    .GenerateColumnarArrayInterface(&label_storage);
+  auto labels =
+      RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage);
 
   DMatrixProxy proxy;
   proxy.SetCUDAArray(data.c_str());
@@ -24,23 +26,16 @@ TEST(ProxyDMatrix, DeviceData) {
 
   ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CupyAdapter>));
   ASSERT_EQ(proxy.Info().labels.Size(), kRows);
-  ASSERT_EQ(dmlc::get<std::shared_ptr<CupyAdapter>>(proxy.Adapter())->NumRows(),
-            kRows);
-  ASSERT_EQ(
-      dmlc::get<std::shared_ptr<CupyAdapter>>(proxy.Adapter())->NumColumns(),
-      kCols);
+  ASSERT_EQ(std::any_cast<std::shared_ptr<CupyAdapter>>(proxy.Adapter())->NumRows(), kRows);
+  ASSERT_EQ(std::any_cast<std::shared_ptr<CupyAdapter>>(proxy.Adapter())->NumColumns(), kCols);
 
   std::vector<HostDeviceVector<float>> columnar_storage(kCols);
   data = RandomDataGenerator(kRows, kCols, 0)
-                    .Device(0)
-                    .GenerateColumnarArrayInterface(&columnar_storage);
+             .Device(0)
+             .GenerateColumnarArrayInterface(&columnar_storage);
   proxy.SetCUDAArray(data.c_str());
   ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>));
-  ASSERT_EQ(dmlc::get<std::shared_ptr<CudfAdapter>>(proxy.Adapter())->NumRows(),
-            kRows);
-  ASSERT_EQ(
-      dmlc::get<std::shared_ptr<CudfAdapter>>(proxy.Adapter())->NumColumns(),
-      kCols);
+  ASSERT_EQ(std::any_cast<std::shared_ptr<CudfAdapter>>(proxy.Adapter())->NumRows(), kRows);
+  ASSERT_EQ(std::any_cast<std::shared_ptr<CudfAdapter>>(proxy.Adapter())->NumColumns(), kCols);
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data

From 3689695d16c3fc3b160d6917a55cb899932f91d4 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sun, 12 Mar 2023 03:14:31 +0800
Subject: [PATCH 03/32] [CI] Run RMM gtests. (#8900)

* [CI] Run RMM gtests.

* Update test-cpp-gpu.sh

---------

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
---
 tests/buildkite/test-cpp-gpu.sh | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh
index 75a600d7a..7c8f5e505 100755
--- a/tests/buildkite/test-cpp-gpu.sh
+++ b/tests/buildkite/test-cpp-gpu.sh
@@ -12,13 +12,12 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
   build/testxgboost
 
-# Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved
-# echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
-# rm -rfv build/
-# buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
-# chmod +x build/testxgboost
-# tests/ci_build/ci_build.sh rmm nvidia-docker \
-#   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
-#   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
-#   --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
-#   "source activate gpu_test && build/testxgboost --use-rmm-pool"
+echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
+rm -rfv build/
+buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
+chmod +x build/testxgboost
+tests/ci_build/ci_build.sh rmm nvidia-docker \
+  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
+  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
+  "source activate gpu_test && build/testxgboost --use-rmm-pool"

From bbee355b452237da391feac369cb907019ee26c4 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 13 Mar 2023 19:30:35 +0800
Subject: [PATCH 04/32] [doc][dask] Note on reproducible result. [skip ci]
 (#8903)

---
 doc/tutorials/dask.rst | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index c010aa0e2..6608a8594 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -564,6 +564,70 @@ computations, one can explicitly wait for results of input data before construct
 Also dask's `diagnostics dashboard <https://distributed.dask.org/en/latest/web.html>`_ can be used to
 monitor what operations are currently being performed.
 
+*******************
+Reproducible Result
+*******************
+
+In a single node mode, we can always expect the same training result between runs as long
+as the underlying platforms are the same. However, it's difficult to obtain reproducible
+results in a distributed environment, since the tasks might get different machine
+allocations or have different amounts of available resources during different
+sessions. There are heuristics and guidelines on how to achieve it but no proven method
+for guaranteeing such deterministic behavior. The Dask interface in XGBoost tries to
+provide reproducible results with best effort. This section highlights some known criteria
+and tries to share some insight into the issue.
+
+There are primarily two different tasks for XGBoost to carry out, training and
+inference. Inference is reproducible given the same software and hardware along with the
+same run-time configurations like number of threads. The remainder of this section will
+focus on training.
+
+Many of the challenges come from the fact that we are using approximation algorithms. The
+sketching algorithm used to find histogram bins is an approximation to the exact quantile
+algorithm, the `AUC` metric in a distributed environment is an approximation to the exact
+`AUC` score, and floating-point numbers are an approximation to real numbers. Floating point
+is an issue as its summation is not associative, meaning :math:`(a + b) + c` does not
+necessarily equal :math:`a + (b + c)`, even though this property holds true for real
+numbers. As a result, whenever we change the order of summation, the result can
+differ. This imposes the requirement that, in order to have reproducible output from
+XGBoost, the entire pipeline needs to be reproducible.
+
+- The software stack is the same for each run. This goes without saying. XGBoost might
+  generate different outputs between different versions. This is expected as we might
+  change the default value of a hyper-parameter, or the parallel strategy that generates
+  a different floating point result. We guarantee the correctness of the algorithms, but there
+  is a lot of wiggle room for the final output. The situation is similar for many
+  dependencies, for instance, the random number generator might differ from platform to
+  platform.
+
+- The hardware stack is the same for each run. This includes the number of workers, and
+  the amount of available resources on each worker. XGBoost can generate different results
+  using different numbers of workers. This is caused by the approximation issue mentioned
+  previously.
+
+- Similar to the hardware constraint, the network topology is also a factor in the final
+  output. If we change the topology, the workers might be ordered differently, leading to
+  a different ordering of floating-point operations.
+
+- The random seeds used in various places of the pipeline.
+
+- The partitioning of data needs to be reproducible. This is related to the available
+  resources on each worker. Dask might partition the data differently for each run
+  according to its own scheduling policy. For instance, if there are some additional tasks
+  in the cluster while you are running the second training session for XGBoost, some of
+  the workers might have constrained memory and Dask may not push the training data for
+  XGBoost to that worker. This change in data partitioning can lead to different output
+  models. If you are using a shared Dask cluster, then the result is likely to vary
+  between runs.
+
+- The operations performed on dataframes need to be reproducible. Some operations,
+  like `DataFrame.merge`, are not deterministic on parallel hardware like GPUs, where
+  the order of the index of the merge result might differ from run to run.
+
+It's expected to have different results when training the model in a distributed environment
+than training the model using a single node due to the aforementioned criteria.
+
+
 ************
 Memory Usage
 ************

From 5ba3509dd31fe2bf12116e83e5156354cb439d20 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 13 Mar 2023 19:31:05 +0800
Subject: [PATCH 05/32] Define multi expand entry. (#8895)

---
 src/tree/hist/expand_entry.h                | 121 +++++++++++++++-----
 src/tree/updater_approx.cc                  |   4 +-
 src/tree/updater_quantile_hist.cc           |   6 +-
 tests/cpp/tree/hist/test_evaluate_splits.cc |   3 +-
 tests/cpp/tree/hist/test_histogram.cc       |  30 +++--
 tests/cpp/tree/test_approx.cc               |  18 +--
 tests/cpp/tree/test_quantile_hist.cc        |   3 +-
 7 files changed, 125 insertions(+), 60 deletions(-)

diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h
index 885a109bf..acd6edf2b 100644
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -1,29 +1,51 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023 XGBoost contributors
  */
 #ifndef XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
 #define XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
 
-#include <utility>
-#include "../param.h"
+#include <algorithm>       // for all_of
+#include <ostream>         // for ostream
+#include <utility>         // for move
+#include <vector>          // for vector
 
-namespace xgboost {
-namespace tree {
+#include "../param.h"      // for SplitEntry, SplitEntryContainer, TrainParam
+#include "xgboost/base.h"  // for GradientPairPrecise, bst_node_t
 
-struct CPUExpandEntry {
-  int nid;
-  int depth;
-  SplitEntry split;
-  CPUExpandEntry() = default;
-  XGBOOST_DEVICE
-  CPUExpandEntry(int nid, int depth, SplitEntry split)
-      : nid(nid), depth(depth), split(std::move(split)) {}
-  CPUExpandEntry(int nid, int depth, float loss_chg)
-      : nid(nid), depth(depth)  {
-    split.loss_chg = loss_chg;
+namespace xgboost::tree {
+/**
+ * \brief Structure for storing tree split candidate.
+ */
+template <typename Impl>
+struct ExpandEntryImpl {
+  bst_node_t nid;
+  bst_node_t depth;
+
+  [[nodiscard]] float GetLossChange() const {
+    return static_cast<Impl const*>(this)->split.loss_chg;
+  }
+  [[nodiscard]] bst_node_t GetNodeId() const { return nid; }
+
+  static bool ChildIsValid(TrainParam const& param, bst_node_t depth, bst_node_t num_leaves) {
+    if (param.max_depth > 0 && depth >= param.max_depth) return false;
+    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
+    return true;
   }
 
-  bool IsValid(const TrainParam& param, int num_leaves) const {
+  [[nodiscard]] bool IsValid(TrainParam const& param, bst_node_t num_leaves) const {
+    return static_cast<Impl const*>(this)->IsValidImpl(param, num_leaves);
+  }
+};
+
+struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
+  SplitEntry split;
+
+  CPUExpandEntry() = default;
+  CPUExpandEntry(bst_node_t nidx, bst_node_t depth, SplitEntry split)
+      : ExpandEntryImpl{nidx, depth}, split(std::move(split)) {}
+  CPUExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {}
+
+  [[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
     if (split.loss_chg <= kRtEps) return false;
     if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
       return false;
@@ -40,16 +62,7 @@ struct CPUExpandEntry {
     return true;
   }
 
-  float GetLossChange() const { return split.loss_chg; }
-  bst_node_t GetNodeId() const { return nid; }
-
-  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
-    if (param.max_depth > 0 && depth >= param.max_depth) return false;
-    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
-    return true;
-  }
-
-  friend std::ostream& operator<<(std::ostream& os, const CPUExpandEntry& e) {
+  friend std::ostream& operator<<(std::ostream& os, CPUExpandEntry const& e) {
     os << "ExpandEntry:\n";
     os << "nidx: " << e.nid << "\n";
     os << "depth: " << e.depth << "\n";
@@ -58,6 +71,54 @@ struct CPUExpandEntry {
     return os;
   }
 };
-}  // namespace tree
-}  // namespace xgboost
+
+struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
+  SplitEntryContainer<std::vector<GradientPairPrecise>> split;
+
+  MultiExpandEntry() = default;
+  MultiExpandEntry(bst_node_t nidx, bst_node_t depth) : ExpandEntryImpl{nidx, depth} {}
+
+  [[nodiscard]] bool IsValidImpl(TrainParam const& param, bst_node_t num_leaves) const {
+    if (split.loss_chg <= kRtEps) return false;
+    auto is_zero = [](auto const& sum) {
+      return std::all_of(sum.cbegin(), sum.cend(),
+                         [&](auto const& g) { return g.GetHess() - .0 == .0; });
+    };
+    if (is_zero(split.left_sum) || is_zero(split.right_sum)) {
+      return false;
+    }
+    if (split.loss_chg < param.min_split_loss) {
+      return false;
+    }
+    if (param.max_depth > 0 && depth == param.max_depth) {
+      return false;
+    }
+    if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
+      return false;
+    }
+    return true;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
+    os << "ExpandEntry: \n";
+    os << "nidx: " << e.nid << "\n";
+    os << "depth: " << e.depth << "\n";
+    os << "loss: " << e.split.loss_chg << "\n";
+    os << "split cond:" << e.split.split_value << "\n";
+    os << "split ind:" << e.split.SplitIndex() << "\n";
+    os << "left_sum: [";
+    for (auto v : e.split.left_sum) {
+      os << v << ", ";
+    }
+    os << "]\n";
+
+    os << "right_sum: [";
+    for (auto v : e.split.right_sum) {
+      os << v << ", ";
+    }
+    os << "]\n";
+    return os;
+  }
+};
+}  // namespace xgboost::tree
 #endif  // XGBOOST_TREE_HIST_EXPAND_ENTRY_H_
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 5af2721a6..fd636d3a3 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -226,8 +226,8 @@ class GloablApproxBuilder {
         for (auto const &candidate : valid_candidates) {
           int left_child_nidx = tree[candidate.nid].LeftChild();
           int right_child_nidx = tree[candidate.nid].RightChild();
-          CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx), {}};
-          CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx), {}};
+          CPUExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)};
+          CPUExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)};
           best_splits.push_back(l_best);
           best_splits.push_back(r_best);
         }
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 76c402ff5..7d5f6efb3 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -57,7 +57,7 @@ bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data,
 
 CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
     DMatrix *p_fmat, RegTree *p_tree, const std::vector<GradientPair> &gpair_h) {
-  CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0), 0.0f);
+  CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
 
   size_t page_id = 0;
   auto space = ConstructHistSpace(partitioner_, {node});
@@ -197,8 +197,8 @@ void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree,
       for (auto const &candidate : valid_candidates) {
         int left_child_nidx = tree[candidate.nid].LeftChild();
         int right_child_nidx = tree[candidate.nid].RightChild();
-        CPUExpandEntry l_best{left_child_nidx, depth, 0.0};
-        CPUExpandEntry r_best{right_child_nidx, depth, 0.0};
+        CPUExpandEntry l_best{left_child_nidx, depth};
+        CPUExpandEntry r_best{right_child_nidx, depth};
         best_splits.push_back(l_best);
         best_splits.push_back(r_best);
       }
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index fc94f3130..cf9d78f52 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -98,7 +98,8 @@ TEST(HistEvaluator, Apply) {
   auto sampler = std::make_shared<common::ColumnSampler>();
   auto evaluator_ = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
 
-  CPUExpandEntry entry{0, 0, 10.0f};
+  CPUExpandEntry entry{0, 0};
+  entry.split.loss_chg = 10.0f;
   entry.split.left_sum = GradStats{0.4, 0.6f};
   entry.split.right_sum = GradStats{0.5, 0.5f};
 
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 8462fa7d5..3b354bebb 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -41,10 +41,10 @@ void TestAddHistRows(bool is_distributed) {
   tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
   tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
   tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
-  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3), 0.0f);
-  nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4), 0.0f);
-  nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5), 0.0f);
-  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6), 0.0f);
+  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
+  nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4));
+  nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5));
+  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
 
   HistogramBuilder<CPUExpandEntry> histogram_builder;
   histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1,
@@ -98,7 +98,7 @@ void TestSyncHist(bool is_distributed) {
   }
 
   // level 0
-  nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0), 0.0f);
+  nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0));
   histogram.AddHistRows(&starting_index, &sync_count,
                         nodes_for_explicit_hist_build_,
                         nodes_for_subtraction_trick_, &tree);
@@ -108,10 +108,8 @@ void TestSyncHist(bool is_distributed) {
   nodes_for_subtraction_trick_.clear();
 
   // level 1
-  nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(),
-                                              tree.GetDepth(1), 0.0f);
-  nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(),
-                                            tree.GetDepth(2), 0.0f);
+  nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(), tree.GetDepth(1));
+  nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(), tree.GetDepth(2));
 
   histogram.AddHistRows(&starting_index, &sync_count,
                         nodes_for_explicit_hist_build_,
@@ -123,10 +121,10 @@ void TestSyncHist(bool is_distributed) {
   nodes_for_explicit_hist_build_.clear();
   nodes_for_subtraction_trick_.clear();
   // level 2
-  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3), 0.0f);
-  nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4), 0.0f);
-  nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5), 0.0f);
-  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6), 0.0f);
+  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
+  nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4));
+  nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5));
+  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
 
   histogram.AddHistRows(&starting_index, &sync_count,
                         nodes_for_explicit_hist_build_,
@@ -256,7 +254,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
   std::iota(row_indices.begin(), row_indices.end(), 0);
   row_set_collection.Init();
 
-  CPUExpandEntry node(RegTree::kRoot, tree.GetDepth(0), 0.0f);
+  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
   nodes_for_explicit_hist_build.push_back(node);
   for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>({kMaxBins, 0.5})) {
@@ -330,7 +328,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   BatchParam batch_param{0, static_cast<int32_t>(kBins)};
 
   RegTree tree;
-  CPUExpandEntry node(RegTree::kRoot, tree.GetDepth(0), 0.0f);
+  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
   nodes_for_explicit_hist_build.push_back(node);
 
@@ -403,7 +401,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
 
   RegTree tree;
   std::vector<CPUExpandEntry> nodes;
-  nodes.emplace_back(0, tree.GetDepth(0), 0.0f);
+  nodes.emplace_back(0, tree.GetDepth(0));
 
   common::GHistRow multi_page;
   HistogramBuilder<CPUExpandEntry> multi_build;
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index cae76c373..308ae0823 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2021-2022, XGBoost contributors.
+/**
+ * Copyright 2021-2023 by XGBoost contributors.
  */
 #include <gtest/gtest.h>
 
@@ -10,7 +10,6 @@
 
 namespace xgboost {
 namespace tree {
-
 namespace {
 std::vector<float> GenerateHess(size_t n_samples) {
   auto grad = GenerateRandomGradients(n_samples);
@@ -32,7 +31,8 @@ TEST(Approx, Partitioner) {
 
   auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
   auto hess = GenerateHess(n_samples);
-  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
 
   for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
     bst_feature_t const split_ind = 0;
@@ -79,7 +79,9 @@ void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared
                                 CommonRowPartitioner const& expected_mid_partitioner) {
   auto dmat =
       std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
-  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
+
   Context ctx;
   ctx.InitAllowUnknown(Args{});
   for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({64, *hess, true})) {
@@ -124,7 +126,8 @@ TEST(Approx, PartitionerColSplit) {
   size_t n_samples = 1024, n_features = 16, base_rowid = 0;
   auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
   auto hess = GenerateHess(n_samples);
-  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
 
   float min_value, mid_value;
   Context ctx;
@@ -154,7 +157,8 @@ void TestLeafPartition(size_t n_samples) {
   CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
 
   auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
-  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
   RegTree tree;
   std::vector<float> hess(n_samples, 0);
   // emulate sampling
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index ad98d1d6b..42edc2124 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -29,7 +29,8 @@ TEST(QuantileHist, Partitioner) {
   ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
 
   auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
-  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
 
   auto cuts = common::SketchOnDMatrix(Xy.get(), 64, ctx.Threads());
 

From 9bade7203a5744ba4d5b226bedb0dbf7a980081f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 13 Mar 2023 20:55:10 +0800
Subject: [PATCH 06/32] Remove public access to tree model param. (#8902)

* Make tree model param a private member.
* Number of features and targets are immutable after construction.

This is to reduce the number of places where we can run configuration.
---
 include/xgboost/tree_model.h                  | 146 +++++++++---------
 src/gbm/gbtree.cc                             |   4 +-
 src/learner.cc                                |   2 -
 src/predictor/cpu_predictor.cc                |   2 +-
 src/tree/tree_model.cc                        |  78 +++++-----
 src/tree/updater_colmaker.cc                  |   6 +-
 src/tree/updater_prune.cc                     |   2 +-
 src/tree/updater_refresh.cc                   |   8 +-
 tests/cpp/tree/test_histmaker.cc              |  14 +-
 .../cpp/tree/test_multi_target_tree_model.cc  |  17 +-
 tests/cpp/tree/test_prune.cc                  |   3 +-
 tests/cpp/tree/test_refresh.cc                |   3 +-
 tests/cpp/tree/test_tree_model.cc             |  14 +-
 tests/cpp/tree/test_tree_stat.cc              |   9 +-
 14 files changed, 149 insertions(+), 159 deletions(-)

diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index f646140dc..61dd94302 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -178,51 +178,33 @@ class RegTree : public Model {
     }
 
     /*! \brief index of left child */
-    XGBOOST_DEVICE [[nodiscard]] int LeftChild() const {
-      return this->cleft_;
-    }
+    [[nodiscard]] XGBOOST_DEVICE int LeftChild() const { return this->cleft_; }
     /*! \brief index of right child */
-    XGBOOST_DEVICE [[nodiscard]] int RightChild() const {
-      return this->cright_;
-    }
+    [[nodiscard]] XGBOOST_DEVICE int RightChild() const { return this->cright_; }
     /*! \brief index of default child when feature is missing */
-    XGBOOST_DEVICE [[nodiscard]] int DefaultChild() const {
+    [[nodiscard]] XGBOOST_DEVICE int DefaultChild() const {
       return this->DefaultLeft() ? this->LeftChild() : this->RightChild();
     }
     /*! \brief feature index of split condition */
-    XGBOOST_DEVICE [[nodiscard]] unsigned SplitIndex() const {
+    [[nodiscard]] XGBOOST_DEVICE unsigned SplitIndex() const {
       return sindex_ & ((1U << 31) - 1U);
     }
     /*! \brief when feature is unknown, whether goes to left child */
-    XGBOOST_DEVICE [[nodiscard]] bool DefaultLeft() const {
-      return (sindex_ >> 31) != 0;
-    }
+    [[nodiscard]] XGBOOST_DEVICE bool DefaultLeft() const { return (sindex_ >> 31) != 0; }
     /*! \brief whether current node is leaf node */
-    XGBOOST_DEVICE [[nodiscard]] bool IsLeaf() const {
-      return cleft_ == kInvalidNodeId;
-    }
+    [[nodiscard]] XGBOOST_DEVICE bool IsLeaf() const { return cleft_ == kInvalidNodeId; }
     /*! \return get leaf value of leaf node */
-    XGBOOST_DEVICE [[nodiscard]] float LeafValue() const {
-      return (this->info_).leaf_value;
-    }
+    [[nodiscard]] XGBOOST_DEVICE float LeafValue() const { return (this->info_).leaf_value; }
     /*! \return get split condition of the node */
-    XGBOOST_DEVICE [[nodiscard]] SplitCondT SplitCond() const {
-      return (this->info_).split_cond;
-    }
+    [[nodiscard]] XGBOOST_DEVICE SplitCondT SplitCond() const { return (this->info_).split_cond; }
     /*! \brief get parent of the node */
-    XGBOOST_DEVICE [[nodiscard]] int Parent() const {
-      return parent_ & ((1U << 31) - 1);
-    }
+    [[nodiscard]] XGBOOST_DEVICE int Parent() const { return parent_ & ((1U << 31) - 1); }
     /*! \brief whether current node is left child */
-    XGBOOST_DEVICE [[nodiscard]] bool IsLeftChild() const {
-      return (parent_ & (1U << 31)) != 0;
-    }
+    [[nodiscard]] XGBOOST_DEVICE bool IsLeftChild() const { return (parent_ & (1U << 31)) != 0; }
     /*! \brief whether this node is deleted */
-    XGBOOST_DEVICE [[nodiscard]] bool IsDeleted() const {
-      return sindex_ == kDeletedNodeMarker;
-    }
+    [[nodiscard]] XGBOOST_DEVICE bool IsDeleted() const { return sindex_ == kDeletedNodeMarker; }
     /*! \brief whether current node is root */
-    XGBOOST_DEVICE [[nodiscard]] bool IsRoot() const { return parent_ == kInvalidNodeId; }
+    [[nodiscard]] XGBOOST_DEVICE bool IsRoot() const { return parent_ == kInvalidNodeId; }
     /*!
      * \brief set the left child
      * \param nid node id to right child
@@ -337,15 +319,13 @@ class RegTree : public Model {
     this->ChangeToLeaf(rid, value);
   }
 
-  /*! \brief model parameter */
-  TreeParam param;
   RegTree() {
-    param.Init(Args{});
-    nodes_.resize(param.num_nodes);
-    stats_.resize(param.num_nodes);
-    split_types_.resize(param.num_nodes, FeatureType::kNumerical);
-    split_categories_segments_.resize(param.num_nodes);
-    for (int i = 0; i < param.num_nodes; i++) {
+    param_.Init(Args{});
+    nodes_.resize(param_.num_nodes);
+    stats_.resize(param_.num_nodes);
+    split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
+    split_categories_segments_.resize(param_.num_nodes);
+    for (int i = 0; i < param_.num_nodes; i++) {
       nodes_[i].SetLeaf(0.0f);
       nodes_[i].SetParent(kInvalidNodeId);
     }
@@ -354,10 +334,10 @@ class RegTree : public Model {
    * \brief Constructor that initializes the tree model with shape.
    */
   explicit RegTree(bst_target_t n_targets, bst_feature_t n_features) : RegTree{} {
-    param.num_feature = n_features;
-    param.size_leaf_vector = n_targets;
+    param_.num_feature = n_features;
+    param_.size_leaf_vector = n_targets;
     if (n_targets > 1) {
-      this->p_mt_tree_.reset(new MultiTargetTree{&param});
+      this->p_mt_tree_.reset(new MultiTargetTree{&param_});
     }
   }
 
@@ -401,7 +381,7 @@ class RegTree : public Model {
 
   bool operator==(const RegTree& b) const {
     return nodes_ == b.nodes_ && stats_ == b.stats_ &&
-           deleted_nodes_ == b.deleted_nodes_ && param == b.param;
+           deleted_nodes_ == b.deleted_nodes_ && param_ == b.param_;
   }
   /* \brief Iterate through all nodes in this tree.
    *
@@ -459,7 +439,9 @@ class RegTree : public Model {
                   bst_float loss_change, float sum_hess, float left_sum,
                   float right_sum,
                   bst_node_t leaf_right_child = kInvalidNodeId);
-
+  /**
+   * \brief Expands a leaf node into two additional leaf nodes for a multi-target tree.
+   */
   void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left,
                   linalg::VectorView<float const> base_weight,
                   linalg::VectorView<float const> left_weight,
@@ -485,19 +467,48 @@ class RegTree : public Model {
                          bst_float base_weight, bst_float left_leaf_weight,
                          bst_float right_leaf_weight, bst_float loss_change, float sum_hess,
                          float left_sum, float right_sum);
-
-  [[nodiscard]] bool HasCategoricalSplit() const {
-    return !split_categories_.empty();
-  }
+  /**
+   * \brief Whether this tree has categorical split.
+   */
+  [[nodiscard]] bool HasCategoricalSplit() const { return !split_categories_.empty(); }
   /**
    * \brief Whether this is a multi-target tree.
    */
   [[nodiscard]] bool IsMultiTarget() const { return static_cast<bool>(p_mt_tree_); }
-  [[nodiscard]] bst_target_t NumTargets() const { return param.size_leaf_vector; }
+  /**
+   * \brief The size of the leaf weight vector, i.e. the number of targets.
+   */
+  [[nodiscard]] bst_target_t NumTargets() const { return param_.size_leaf_vector; }
+  /**
+   * \brief Get the underlying implementation of multi-target tree.
+   */
   [[nodiscard]] auto GetMultiTargetTree() const {
     CHECK(IsMultiTarget());
     return p_mt_tree_.get();
   }
+  /**
+   * \brief Get the number of features.
+   */
+  [[nodiscard]] bst_feature_t NumFeatures() const noexcept { return param_.num_feature; }
+  /**
+   * \brief Get the total number of nodes including deleted ones in this tree.
+   */
+  [[nodiscard]] bst_node_t NumNodes() const noexcept { return param_.num_nodes; }
+  /**
+   * \brief Get the total number of valid nodes in this tree.
+   */
+  [[nodiscard]] bst_node_t NumValidNodes() const noexcept {
+    return param_.num_nodes - param_.num_deleted;
+  }
+  /**
+   * \brief number of extra nodes besides the root
+   */
+  [[nodiscard]] bst_node_t NumExtraNodes() const noexcept {
+    return param_.num_nodes - 1 - param_.num_deleted;
+  }
+  /** \brief Count number of leaves in tree. */
+  [[nodiscard]] bst_node_t GetNumLeaves() const;
+  [[nodiscard]] bst_node_t GetNumSplitNodes() const;
 
   /*!
    * \brief get current depth
@@ -514,6 +525,9 @@ class RegTree : public Model {
     }
     return depth;
   }
+  /**
+   * \brief Set the leaf weight for a multi-target tree.
+   */
   void SetLeaf(bst_node_t nidx, linalg::VectorView<float const> weight) {
     CHECK(IsMultiTarget());
     return this->p_mt_tree_->SetLeaf(nidx, weight);
@@ -525,25 +539,13 @@ class RegTree : public Model {
    */
   [[nodiscard]] int MaxDepth(int nid) const {
     if (nodes_[nid].IsLeaf()) return 0;
-    return std::max(MaxDepth(nodes_[nid].LeftChild())+1,
-                     MaxDepth(nodes_[nid].RightChild())+1);
+    return std::max(MaxDepth(nodes_[nid].LeftChild()) + 1, MaxDepth(nodes_[nid].RightChild()) + 1);
   }
 
   /*!
    * \brief get maximum depth
    */
-  int MaxDepth() {
-    return MaxDepth(0);
-  }
-
-  /*! \brief number of extra nodes besides the root */
-  [[nodiscard]] int NumExtraNodes() const {
-    return param.num_nodes - 1 - param.num_deleted;
-  }
-
-  /* \brief Count number of leaves in tree. */
-  [[nodiscard]] bst_node_t GetNumLeaves() const;
-  [[nodiscard]] bst_node_t GetNumSplitNodes() const;
+  int MaxDepth() { return MaxDepth(0); }
 
   /*!
    * \brief dense feature vector that can be taken by RegTree
@@ -735,6 +737,8 @@ class RegTree : public Model {
   template <bool typed>
   void LoadCategoricalSplit(Json const& in);
   void SaveCategoricalSplit(Json* p_out) const;
+  /*! \brief model parameter */
+  TreeParam param_;
   // vector of nodes
   std::vector<Node> nodes_;
   // free node space, used during training process
@@ -752,20 +756,20 @@ class RegTree : public Model {
   // allocate a new node,
   // !!!!!! NOTE: may cause BUG here, nodes.resize
   bst_node_t AllocNode() {
-    if (param.num_deleted != 0) {
+    if (param_.num_deleted != 0) {
       int nid = deleted_nodes_.back();
       deleted_nodes_.pop_back();
       nodes_[nid].Reuse();
-      --param.num_deleted;
+      --param_.num_deleted;
       return nid;
     }
-    int nd = param.num_nodes++;
-    CHECK_LT(param.num_nodes, std::numeric_limits<int>::max())
+    int nd = param_.num_nodes++;
+    CHECK_LT(param_.num_nodes, std::numeric_limits<int>::max())
         << "number of nodes in the tree exceed 2^31";
-    nodes_.resize(param.num_nodes);
-    stats_.resize(param.num_nodes);
-    split_types_.resize(param.num_nodes, FeatureType::kNumerical);
-    split_categories_segments_.resize(param.num_nodes);
+    nodes_.resize(param_.num_nodes);
+    stats_.resize(param_.num_nodes);
+    split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
+    split_categories_segments_.resize(param_.num_nodes);
     return nd;
   }
   // delete a tree node, keep the parent field to allow trace back
@@ -780,7 +784,7 @@ class RegTree : public Model {
 
     deleted_nodes_.push_back(nid);
     nodes_[nid].MarkDelete();
-    ++param.num_deleted;
+    ++param_.num_deleted;
   }
 };
 
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index c1cb825c1..16609619c 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -360,8 +360,8 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
           << "Set `process_type` to `update` if you want to update existing "
              "trees.";
       // create new tree
-      std::unique_ptr<RegTree> ptr(new RegTree());
-      ptr->param.UpdateAllowUnknown(this->cfg_);
+      std::unique_ptr<RegTree> ptr(new RegTree{this->model_.learner_model_param->LeafLength(),
+                                               this->model_.learner_model_param->num_feature});
       new_trees.push_back(ptr.get());
       ret->push_back(std::move(ptr));
     } else if (tparam_.process_type == TreeProcessType::kUpdate) {
diff --git a/src/learner.cc b/src/learner.cc
index 454855355..62875ead6 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -775,8 +775,6 @@ class LearnerConfiguration : public Learner {
     }
     CHECK_NE(mparam_.num_feature, 0)
         << "0 feature is supplied.  Are you using raw Booster interface?";
-    // Remove these once binary IO is gone.
-    cfg_["num_feature"] = common::ToString(mparam_.num_feature);
   }
 
   void ConfigureGBM(LearnerTrainParam const& old, Args const& args) {
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 4473173d2..a4b78fefd 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -275,7 +275,7 @@ float FillNodeMeanValues(RegTree const *tree, bst_node_t nidx, std::vector<float
 }
 
 void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
-  size_t num_nodes = tree->param.num_nodes;
+  size_t num_nodes = tree->NumNodes();
   if (mean_values->size() == num_nodes) {
     return;
   }
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc
index 0891ec3b2..8f297f46d 100644
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -815,9 +815,9 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
                          linalg::VectorView<float const> left_weight,
                          linalg::VectorView<float const> right_weight) {
   CHECK(IsMultiTarget());
-  CHECK_LT(split_index, this->param.num_feature);
+  CHECK_LT(split_index, this->param_.num_feature);
   CHECK(this->p_mt_tree_);
-  CHECK_GT(param.size_leaf_vector, 1);
+  CHECK_GT(param_.size_leaf_vector, 1);
 
   this->p_mt_tree_->Expand(nidx, split_index, split_cond, default_left, base_weight, left_weight,
                            right_weight);
@@ -826,7 +826,7 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split
   split_categories_segments_.resize(this->Size());
   this->split_types_.at(nidx) = FeatureType::kNumerical;
 
-  this->param.num_nodes = this->p_mt_tree_->Size();
+  this->param_.num_nodes = this->p_mt_tree_->Size();
 }
 
 void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index,
@@ -850,13 +850,13 @@ void RegTree::ExpandCategorical(bst_node_t nid, bst_feature_t split_index,
 }
 
 void RegTree::Load(dmlc::Stream* fi) {
-  CHECK_EQ(fi->Read(&param, sizeof(TreeParam)), sizeof(TreeParam));
+  CHECK_EQ(fi->Read(&param_, sizeof(TreeParam)), sizeof(TreeParam));
   if (!DMLC_IO_NO_ENDIAN_SWAP) {
-    param = param.ByteSwap();
+    param_ = param_.ByteSwap();
   }
-  nodes_.resize(param.num_nodes);
-  stats_.resize(param.num_nodes);
-  CHECK_NE(param.num_nodes, 0);
+  nodes_.resize(param_.num_nodes);
+  stats_.resize(param_.num_nodes);
+  CHECK_NE(param_.num_nodes, 0);
   CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()),
            sizeof(Node) * nodes_.size());
   if (!DMLC_IO_NO_ENDIAN_SWAP) {
@@ -873,29 +873,29 @@ void RegTree::Load(dmlc::Stream* fi) {
   }
   // chg deleted nodes
   deleted_nodes_.resize(0);
-  for (int i = 1; i < param.num_nodes; ++i) {
+  for (int i = 1; i < param_.num_nodes; ++i) {
     if (nodes_[i].IsDeleted()) {
       deleted_nodes_.push_back(i);
     }
   }
-  CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param.num_deleted);
+  CHECK_EQ(static_cast<int>(deleted_nodes_.size()), param_.num_deleted);
 
-  split_types_.resize(param.num_nodes, FeatureType::kNumerical);
-  split_categories_segments_.resize(param.num_nodes);
+  split_types_.resize(param_.num_nodes, FeatureType::kNumerical);
+  split_categories_segments_.resize(param_.num_nodes);
 }
 
 void RegTree::Save(dmlc::Stream* fo) const {
-  CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
-  CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
-  CHECK_EQ(param.deprecated_num_roots, 1);
-  CHECK_NE(param.num_nodes, 0);
+  CHECK_EQ(param_.num_nodes, static_cast<int>(nodes_.size()));
+  CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
+  CHECK_EQ(param_.deprecated_num_roots, 1);
+  CHECK_NE(param_.num_nodes, 0);
   CHECK(!HasCategoricalSplit())
       << "Please use JSON/UBJSON for saving models with categorical splits.";
 
   if (DMLC_IO_NO_ENDIAN_SWAP) {
-    fo->Write(&param, sizeof(TreeParam));
+    fo->Write(&param_, sizeof(TreeParam));
   } else {
-    TreeParam x = param.ByteSwap();
+    TreeParam x = param_.ByteSwap();
     fo->Write(&x, sizeof(x));
   }
 
@@ -1081,7 +1081,7 @@ void RegTree::LoadModel(Json const& in) {
   bool typed = IsA<I32Array>(in[tf::kParent]);
   auto const& in_obj = get<Object const>(in);
   // basic properties
-  FromJson(in["tree_param"], &param);
+  FromJson(in["tree_param"], &param_);
   // categorical splits
   bool has_cat = in_obj.find("split_type") != in_obj.cend();
   if (has_cat) {
@@ -1092,55 +1092,55 @@ void RegTree::LoadModel(Json const& in) {
     }
   }
   // multi-target
-  if (param.size_leaf_vector > 1) {
-    this->p_mt_tree_.reset(new MultiTargetTree{&param});
+  if (param_.size_leaf_vector > 1) {
+    this->p_mt_tree_.reset(new MultiTargetTree{&param_});
     this->GetMultiTargetTree()->LoadModel(in);
     return;
   }
 
   bool feature_is_64 = IsA<I64Array>(in["split_indices"]);
   if (typed && feature_is_64) {
-    LoadModelImpl<true, true>(in, param, &stats_, &nodes_);
+    LoadModelImpl<true, true>(in, param_, &stats_, &nodes_);
   } else if (typed && !feature_is_64) {
-    LoadModelImpl<true, false>(in, param, &stats_, &nodes_);
+    LoadModelImpl<true, false>(in, param_, &stats_, &nodes_);
   } else if (!typed && feature_is_64) {
-    LoadModelImpl<false, true>(in, param, &stats_, &nodes_);
+    LoadModelImpl<false, true>(in, param_, &stats_, &nodes_);
   } else {
-    LoadModelImpl<false, false>(in, param, &stats_, &nodes_);
+    LoadModelImpl<false, false>(in, param_, &stats_, &nodes_);
   }
 
   if (!has_cat) {
-    this->split_categories_segments_.resize(this->param.num_nodes);
-    this->split_types_.resize(this->param.num_nodes);
+    this->split_categories_segments_.resize(this->param_.num_nodes);
+    this->split_types_.resize(this->param_.num_nodes);
     std::fill(split_types_.begin(), split_types_.end(), FeatureType::kNumerical);
   }
 
   deleted_nodes_.clear();
-  for (bst_node_t i = 1; i < param.num_nodes; ++i) {
+  for (bst_node_t i = 1; i < param_.num_nodes; ++i) {
     if (nodes_[i].IsDeleted()) {
       deleted_nodes_.push_back(i);
     }
   }
   // easier access to [] operator
   auto& self = *this;
-  for (auto nid = 1; nid < param.num_nodes; ++nid) {
+  for (auto nid = 1; nid < param_.num_nodes; ++nid) {
     auto parent = self[nid].Parent();
     CHECK_NE(parent, RegTree::kInvalidNodeId);
     self[nid].SetParent(self[nid].Parent(), self[parent].LeftChild() == nid);
   }
-  CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param.num_deleted);
-  CHECK_EQ(this->split_categories_segments_.size(), param.num_nodes);
+  CHECK_EQ(static_cast<bst_node_t>(deleted_nodes_.size()), param_.num_deleted);
+  CHECK_EQ(this->split_categories_segments_.size(), param_.num_nodes);
 }
 
 void RegTree::SaveModel(Json* p_out) const {
   auto& out = *p_out;
   // basic properties
-  out["tree_param"] = ToJson(param);
+  out["tree_param"] = ToJson(param_);
   // categorical splits
   this->SaveCategoricalSplit(p_out);
   // multi-target
   if (this->IsMultiTarget()) {
-    CHECK_GT(param.size_leaf_vector, 1);
+    CHECK_GT(param_.size_leaf_vector, 1);
     this->GetMultiTargetTree()->SaveModel(p_out);
     return;
   }
@@ -1150,11 +1150,11 @@ void RegTree::SaveModel(Json* p_out) const {
    *  pruner, and this pruner can be used inside another updater so leaf are not necessary
    *  at the end of node array.
    */
-  CHECK_EQ(param.num_nodes, static_cast<int>(nodes_.size()));
-  CHECK_EQ(param.num_nodes, static_cast<int>(stats_.size()));
+  CHECK_EQ(param_.num_nodes, static_cast<int>(nodes_.size()));
+  CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
 
-  CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes));
-  auto n_nodes = param.num_nodes;
+  CHECK_EQ(get<String>(out["tree_param"]["num_nodes"]), std::to_string(param_.num_nodes));
+  auto n_nodes = param_.num_nodes;
 
   // stats
   F32Array loss_changes(n_nodes);
@@ -1168,7 +1168,7 @@ void RegTree::SaveModel(Json* p_out) const {
 
   F32Array conds(n_nodes);
   U8Array default_left(n_nodes);
-  CHECK_EQ(this->split_types_.size(), param.num_nodes);
+  CHECK_EQ(this->split_types_.size(), param_.num_nodes);
 
   namespace tf = tree_field;
 
@@ -1189,7 +1189,7 @@ void RegTree::SaveModel(Json* p_out) const {
       default_left.Set(i, static_cast<uint8_t>(!!n.DefaultLeft()));
     }
   };
-  if (this->param.num_feature > static_cast<bst_feature_t>(std::numeric_limits<int32_t>::max())) {
+  if (this->param_.num_feature > static_cast<bst_feature_t>(std::numeric_limits<int32_t>::max())) {
     I64Array indices_64(n_nodes);
     save_tree(&indices_64);
     out[tf::kSplitIdx] = std::move(indices_64);
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 06579c429..02edfa74a 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -190,7 +190,7 @@ class ColMaker: public TreeUpdater {
         (*p_tree)[nid].SetLeaf(snode_[nid].weight * param_.learning_rate);
       }
       // remember auxiliary statistics in the tree node
-      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
+      for (int nid = 0; nid < p_tree->NumNodes(); ++nid) {
         p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg;
         p_tree->Stat(nid).base_weight = snode_[nid].weight;
         p_tree->Stat(nid).sum_hess = static_cast<float>(snode_[nid].stats.sum_hess);
@@ -255,9 +255,9 @@ class ColMaker: public TreeUpdater {
       {
         // setup statistics space for each tree node
         for (auto& i : stemp_) {
-          i.resize(tree.param.num_nodes, ThreadEntry());
+          i.resize(tree.NumNodes(), ThreadEntry());
         }
-        snode_.resize(tree.param.num_nodes, NodeEntry());
+        snode_.resize(tree.NumNodes(), NodeEntry());
       }
       const MetaInfo& info = fmat.Info();
       // setup position
diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc
index 0970d2f79..29f9917ba 100644
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -72,7 +72,7 @@ class TreePruner : public TreeUpdater {
   void DoPrune(TrainParam const* param, RegTree* p_tree) {
     auto& tree = *p_tree;
     bst_node_t npruned = 0;
-    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
+    for (int nid = 0; nid < tree.NumNodes(); ++nid) {
       if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) {
         npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned);
       }
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 4bfe603e0..17c565490 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -50,11 +50,11 @@ class TreeRefresher : public TreeUpdater {
         int tid = omp_get_thread_num();
         int num_nodes = 0;
         for (auto tree : trees) {
-          num_nodes += tree->param.num_nodes;
+          num_nodes += tree->NumNodes();
         }
         stemp[tid].resize(num_nodes, GradStats());
         std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
-        fvec_temp[tid].Init(trees[0]->param.num_feature);
+        fvec_temp[tid].Init(trees[0]->NumFeatures());
       });
     }
     exc.Rethrow();
@@ -77,7 +77,7 @@ class TreeRefresher : public TreeUpdater {
           for (auto tree : trees) {
             AddStats(*tree, feats, gpair_h, info, ridx,
                      dmlc::BeginPtr(stemp[tid]) + offset);
-            offset += tree->param.num_nodes;
+            offset += tree->NumNodes();
           }
           feats.Drop(inst);
         });
@@ -96,7 +96,7 @@ class TreeRefresher : public TreeUpdater {
     int offset = 0;
     for (auto tree : trees) {
       this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
-      offset += tree->param.num_nodes;
+      offset += tree->NumNodes();
     }
   }
 
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc
index aa6a18797..881de57e1 100644
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -40,8 +40,7 @@ TEST(GrowHistMaker, InteractionConstraint)
   ObjInfo task{ObjInfo::kRegression};
   {
     // With constraints
-    RegTree tree;
-    tree.param.num_feature = kCols;
+    RegTree tree{1, kCols};
 
     std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
     TrainParam param;
@@ -58,8 +57,7 @@ TEST(GrowHistMaker, InteractionConstraint)
   }
   {
     // Without constraints
-    RegTree tree;
-    tree.param.num_feature = kCols;
+    RegTree tree{1u, kCols};
 
     std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
     std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -76,7 +74,7 @@ TEST(GrowHistMaker, InteractionConstraint)
 }
 
 namespace {
-void TestColumnSplit(int32_t rows, int32_t cols, RegTree const& expected_tree) {
+void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_tree) {
   auto p_dmat = GenerateDMatrix(rows, cols);
   auto p_gradients = GenerateGradients(rows);
   Context ctx;
@@ -87,8 +85,7 @@ void TestColumnSplit(int32_t rows, int32_t cols, RegTree const& expected_tree) {
   std::unique_ptr<DMatrix> sliced{
       p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
 
-  RegTree tree;
-  tree.param.num_feature = cols;
+  RegTree tree{1u, cols};
   TrainParam param;
   param.Init(Args{});
   updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
@@ -107,8 +104,7 @@ TEST(GrowHistMaker, ColumnSplit) {
   auto constexpr kRows = 32;
   auto constexpr kCols = 16;
 
-  RegTree expected_tree;
-  expected_tree.param.num_feature = kCols;
+  RegTree expected_tree{1u, kCols};
   ObjInfo task{ObjInfo::kRegression};
   {
     auto p_dmat = GenerateDMatrix(kRows, kCols);
diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc
index 7d2bd9c7c..af83ed7eb 100644
--- a/tests/cpp/tree/test_multi_target_tree_model.cc
+++ b/tests/cpp/tree/test_multi_target_tree_model.cc
@@ -17,8 +17,8 @@ TEST(MultiTargetTree, JsonIO) {
   linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, Context::kCpuId};
   tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
                   left_weight.HostView(), right_weight.HostView());
-  ASSERT_EQ(tree.param.num_nodes, 3);
-  ASSERT_EQ(tree.param.size_leaf_vector, 3);
+  ASSERT_EQ(tree.NumNodes(), 3);
+  ASSERT_EQ(tree.NumTargets(), 3);
   ASSERT_EQ(tree.GetMultiTargetTree()->Size(), 3);
   ASSERT_EQ(tree.Size(), 3);
 
@@ -26,20 +26,19 @@ TEST(MultiTargetTree, JsonIO) {
   tree.SaveModel(&jtree);
 
   auto check_jtree = [](Json jtree, RegTree const& tree) {
-    ASSERT_EQ(get<String const>(jtree["tree_param"]["num_nodes"]),
-              std::to_string(tree.param.num_nodes));
+    ASSERT_EQ(get<String const>(jtree["tree_param"]["num_nodes"]), std::to_string(tree.NumNodes()));
     ASSERT_EQ(get<F32Array const>(jtree["base_weights"]).size(),
-              tree.param.num_nodes * tree.param.size_leaf_vector);
-    ASSERT_EQ(get<I32Array const>(jtree["parents"]).size(), tree.param.num_nodes);
-    ASSERT_EQ(get<I32Array const>(jtree["left_children"]).size(), tree.param.num_nodes);
-    ASSERT_EQ(get<I32Array const>(jtree["right_children"]).size(), tree.param.num_nodes);
+              tree.NumNodes() * tree.NumTargets());
+    ASSERT_EQ(get<I32Array const>(jtree["parents"]).size(), tree.NumNodes());
+    ASSERT_EQ(get<I32Array const>(jtree["left_children"]).size(), tree.NumNodes());
+    ASSERT_EQ(get<I32Array const>(jtree["right_children"]).size(), tree.NumNodes());
   };
   check_jtree(jtree, tree);
 
   RegTree loaded;
   loaded.LoadModel(jtree);
   ASSERT_TRUE(loaded.IsMultiTarget());
-  ASSERT_EQ(loaded.param.num_nodes, 3);
+  ASSERT_EQ(loaded.NumNodes(), 3);
 
   Json jtree1{Object{}};
   loaded.SaveModel(&jtree1);
diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc
index 063816def..78161cac9 100644
--- a/tests/cpp/tree/test_prune.cc
+++ b/tests/cpp/tree/test_prune.cc
@@ -32,8 +32,7 @@ TEST(Updater, Prune) {
   auto ctx = CreateEmptyGenericParam(GPUIDX);
 
   // prepare tree
-  RegTree tree = RegTree();
-  tree.param.UpdateAllowUnknown(cfg);
+  RegTree tree = RegTree{1u, kCols};
   std::vector<RegTree*> trees {&tree};
   // prepare pruner
   TrainParam param;
diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc
index 80a0cbe6f..f46ec2880 100644
--- a/tests/cpp/tree/test_refresh.cc
+++ b/tests/cpp/tree/test_refresh.cc
@@ -28,9 +28,8 @@ TEST(Updater, Refresh) {
       {"num_feature", std::to_string(kCols)},
       {"reg_lambda", "1"}};
 
-  RegTree tree = RegTree();
+  RegTree tree = RegTree{1u, kCols};
   auto ctx = CreateEmptyGenericParam(GPUIDX);
-  tree.param.UpdateAllowUnknown(cfg);
   std::vector<RegTree*> trees{&tree};
 
   ObjInfo task{ObjInfo::kRegression};
diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc
index 130a0ef70..44708ebd1 100644
--- a/tests/cpp/tree/test_tree_model.cc
+++ b/tests/cpp/tree/test_tree_model.cc
@@ -11,9 +11,8 @@
 namespace xgboost {
 TEST(Tree, ModelShape) {
   bst_feature_t n_features = std::numeric_limits<uint32_t>::max();
-  RegTree tree;
-  tree.param.UpdateAllowUnknown(Args{{"num_feature", std::to_string(n_features)}});
-  ASSERT_EQ(tree.param.num_feature, n_features);
+  RegTree tree{1u, n_features};
+  ASSERT_EQ(tree.NumFeatures(), n_features);
 
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/tree.model";
@@ -27,7 +26,7 @@ TEST(Tree, ModelShape) {
     RegTree new_tree;
     std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(tmp_file.c_str(), "r"));
     new_tree.Load(fi.get());
-    ASSERT_EQ(new_tree.param.num_feature, n_features);
+    ASSERT_EQ(new_tree.NumFeatures(), n_features);
   }
   {
     // json
@@ -39,7 +38,7 @@ TEST(Tree, ModelShape) {
 
     auto j_loaded = Json::Load(StringView{dumped.data(), dumped.size()});
     new_tree.LoadModel(j_loaded);
-    ASSERT_EQ(new_tree.param.num_feature, n_features);
+    ASSERT_EQ(new_tree.NumFeatures(), n_features);
   }
   {
     // ubjson
@@ -51,7 +50,7 @@ TEST(Tree, ModelShape) {
 
     auto j_loaded = Json::Load(StringView{dumped.data(), dumped.size()}, std::ios::binary);
     new_tree.LoadModel(j_loaded);
-    ASSERT_EQ(new_tree.param.num_feature, n_features);
+    ASSERT_EQ(new_tree.NumFeatures(), n_features);
   }
 }
 
@@ -488,8 +487,7 @@ TEST(Tree, JsonIO) {
 
   RegTree loaded_tree;
   loaded_tree.LoadModel(j_tree);
-  ASSERT_EQ(loaded_tree.param.num_nodes, 3);
-
+  ASSERT_EQ(loaded_tree.NumNodes(), 3);
   ASSERT_TRUE(loaded_tree == tree);
 
   auto left = tree[0].LeftChild();
diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc
index a3f5cf9d3..07c51dfcc 100644
--- a/tests/cpp/tree/test_tree_stat.cc
+++ b/tests/cpp/tree/test_tree_stat.cc
@@ -37,8 +37,7 @@ class UpdaterTreeStatTest : public ::testing::Test {
                                            : CreateEmptyGenericParam(Context::kCpuId));
     auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
     up->Configure(Args{});
-    RegTree tree;
-    tree.param.num_feature = kCols;
+    RegTree tree{1u, kCols};
     std::vector<HostDeviceVector<bst_node_t>> position(1);
     up->Update(&param, &gpairs_, p_dmat_.get(), position, {&tree});
 
@@ -95,16 +94,14 @@ class UpdaterEtaTest : public ::testing::Test {
     param1.Init(Args{{"eta", "1.0"}});
 
     for (size_t iter = 0; iter < 4; ++iter) {
-      RegTree tree_0;
+      RegTree tree_0{1u, kCols};
       {
-        tree_0.param.num_feature = kCols;
         std::vector<HostDeviceVector<bst_node_t>> position(1);
         up_0->Update(&param0, &gpairs_, p_dmat_.get(), position, {&tree_0});
       }
 
-      RegTree tree_1;
+      RegTree tree_1{1u, kCols};
       {
-        tree_1.param.num_feature = kCols;
         std::vector<HostDeviceVector<bst_node_t>> position(1);
         up_1->Update(&param1, &gpairs_, p_dmat_.get(), position, {&tree_1});
       }

From 8be6095ece5bbb2059b70287e17a650d84aca39f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 13 Mar 2023 22:16:31 +0800
Subject: [PATCH 07/32] Implement NDCG cache. (#8893)

---
 src/common/ranking_utils.cc            |  98 +++++++++-
 src/common/ranking_utils.cu            | 207 +++++++++++++++++++++
 src/common/ranking_utils.cuh           |  40 +++++
 src/common/ranking_utils.h             | 238 ++++++++++++++++++++++++-
 tests/cpp/common/test_ranking_utils.cc | 119 ++++++++++++-
 tests/cpp/common/test_ranking_utils.cu |  98 ++++++++++
 tests/cpp/common/test_ranking_utils.h  |   9 +
 7 files changed, 798 insertions(+), 11 deletions(-)
 create mode 100644 src/common/ranking_utils.cu
 create mode 100644 src/common/ranking_utils.cuh
 create mode 100644 tests/cpp/common/test_ranking_utils.cu
 create mode 100644 tests/cpp/common/test_ranking_utils.h

diff --git a/src/common/ranking_utils.cc b/src/common/ranking_utils.cc
index 8fad9a206..c8069784b 100644
--- a/src/common/ranking_utils.cc
+++ b/src/common/ranking_utils.cc
@@ -6,9 +6,7 @@
 #include <algorithm>          // for copy_n, max, min, none_of, all_of
 #include <cstddef>            // for size_t
 #include <cstdio>             // for sscanf
-#include <exception>          // for exception
 #include <functional>         // for greater
-#include <iterator>           // for reverse_iterator
 #include <string>             // for char_traits, string
 
 #include "algorithm.h"        // for ArgSort
@@ -18,10 +16,102 @@
 #include "xgboost/base.h"     // for bst_group_t
 #include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"     // for MetaInfo
-#include "xgboost/linalg.h"   // for All, TensorView, Range, Tensor, Vector
-#include "xgboost/logging.h"  // for Error, LogCheck_EQ, CHECK_EQ
+#include "xgboost/linalg.h"   // for All, TensorView, Range
+#include "xgboost/logging.h"  // for CHECK_EQ
 
 namespace xgboost::ltr {
+void RankingCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
+  if (info.group_ptr_.empty()) {
+    // No group information, the whole dataset is treated as a single group.
+    group_ptr_.Resize(2, 0);
+    group_ptr_.HostVector()[1] = info.num_row_;
+  } else {
+    group_ptr_.HostVector() = info.group_ptr_;
+  }
+  auto const& gptr = group_ptr_.ConstHostVector();
+  for (std::size_t i = 1; i < gptr.size(); ++i) {
+    max_group_size_ = std::max(max_group_size_, static_cast<std::size_t>(gptr[i] - gptr[i - 1]));
+  }
+
+  // One weight per group, the counter has the same type as the number of groups.
+  double sum_weights = 0;
+  auto n_groups = Groups();
+  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
+  for (std::size_t k = 0; k < n_groups; ++k) {
+    sum_weights += weight[k];
+  }
+  weight_norm_ = static_cast<double>(n_groups) / sum_weights;
+}
+
+common::Span<std::size_t const> RankingCache::MakeRankOnCPU(Context const* ctx,
+                                                            common::Span<float const> predt) {
+  auto gptr = this->DataGroupPtr(ctx);
+  auto rank = this->sorted_idx_cache_.HostSpan();
+  CHECK_EQ(rank.size(), predt.size());
+
+  common::ParallelFor(this->Groups(), ctx->Threads(), [&](auto g) {
+    auto cnt = gptr[g + 1] - gptr[g];
+    auto g_predt = predt.subspan(gptr[g], cnt);
+    auto g_rank = rank.subspan(gptr[g], cnt);
+    auto sorted_idx = common::ArgSort<std::size_t>(
+        ctx, g_predt.data(), g_predt.data() + g_predt.size(), std::greater<>{});
+    CHECK_EQ(g_rank.size(), sorted_idx.size());
+    std::copy_n(sorted_idx.data(), sorted_idx.size(), g_rank.data());
+  });
+
+  return rank;
+}
+
+#if !defined(XGBOOST_USE_CUDA)
+void RankingCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
+common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const*,
+                                                             common::Span<float const>) {
+  common::AssertGPUSupport();
+  return {};
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+
+void NDCGCache::InitOnCPU(Context const* ctx, MetaInfo const& info) {
+  auto const h_group_ptr = this->DataGroupPtr(ctx);
+
+  discounts_.Resize(MaxGroupSize(), 0);
+  auto& h_discounts = discounts_.HostVector();
+  for (std::size_t i = 0; i < MaxGroupSize(); ++i) {
+    h_discounts[i] = CalcDCGDiscount(i);
+  }
+
+  auto n_groups = h_group_ptr.size() - 1;
+  auto h_labels = info.labels.HostView().Slice(linalg::All(), 0);
+
+  CheckNDCGLabels(this->Param(), h_labels,
+                  [](auto beg, auto end, auto op) { return std::none_of(beg, end, op); });
+
+  inv_idcg_.Reshape(n_groups);
+  auto h_inv_idcg = inv_idcg_.HostView();
+  std::size_t topk = this->Param().TopK();
+  auto const exp_gain = this->Param().ndcg_exp_gain;
+
+  common::ParallelFor(n_groups, ctx->Threads(), [&](auto g) {
+    auto g_labels = h_labels.Slice(linalg::Range(h_group_ptr[g], h_group_ptr[g + 1]));
+    auto sorted_idx = common::ArgSort<std::size_t>(ctx, linalg::cbegin(g_labels),
+                                                   linalg::cend(g_labels), std::greater<>{});
+
+    double idcg{0.0};
+    for (std::size_t i = 0; i < std::min(g_labels.Size(), topk); ++i) {
+      if (exp_gain) {
+        idcg += h_discounts[i] * CalcDCGGain(g_labels(sorted_idx[i]));
+      } else {
+        idcg += h_discounts[i] * g_labels(sorted_idx[i]);
+      }
+    }
+    h_inv_idcg(g) = CalcInvIDCG(idcg);
+  });
+}
+
+#if !defined(XGBOOST_USE_CUDA)
+void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 DMLC_REGISTER_PARAMETER(LambdaRankParam);
 
 std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
new file mode 100644
index 000000000..ce9cda4e2
--- /dev/null
+++ b/src/common/ranking_utils.cu
@@ -0,0 +1,207 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include <thrust/functional.h>                  // for maximum
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/logical.h>                     // for none_of, all_of
+#include <thrust/pair.h>                        // for pair, make_pair
+#include <thrust/reduce.h>                      // for reduce
+#include <thrust/scan.h>                        // for inclusive_scan
+
+#include <cstddef>                              // for size_t
+
+#include "algorithm.cuh"                        // for SegmentedArgSort
+#include "cuda_context.cuh"                     // for CUDAContext
+#include "device_helpers.cuh"                   // for MakeTransformIterator, LaunchN
+#include "optional_weight.h"                    // for MakeOptionalWeights, OptionalWeights
+#include "ranking_utils.cuh"                    // for ThreadsForMean
+#include "ranking_utils.h"
+#include "threading_utils.cuh"                  // for SegmentedTrapezoidThreads
+#include "xgboost/base.h"                       // for XGBOOST_DEVICE, bst_group_t
+#include "xgboost/context.h"                    // for Context
+#include "xgboost/linalg.h"                     // for VectorView, All, Range
+#include "xgboost/logging.h"                    // for CHECK
+#include "xgboost/span.h"                       // for Span
+
+namespace xgboost::ltr {
+namespace cuda_impl {
+void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
+                    common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
+                    common::Span<bst_group_t const> d_group_ptr, std::size_t k,
+                    linalg::VectorView<double> out_dcg) {
+  CHECK_EQ(d_group_ptr.size() - 1, out_dcg.Size());
+  using IdxGroup = thrust::pair<std::size_t, std::size_t>;
+  auto group_it = dh::MakeTransformIterator<IdxGroup>(
+      thrust::make_counting_iterator(0ull), [=] XGBOOST_DEVICE(std::size_t idx) {
+        return thrust::make_pair(idx, dh::SegmentId(d_group_ptr, idx));  // NOLINT
+      });
+  auto value_it = dh::MakeTransformIterator<double>(
+      group_it,
+      [exp_gain, d_labels, d_group_ptr, k,
+       d_sorted_idx] XGBOOST_DEVICE(IdxGroup const& l) -> double {
+        auto g_begin = d_group_ptr[l.second];
+        auto g_size = d_group_ptr[l.second + 1] - g_begin;
+
+        auto idx_in_group = l.first - g_begin;
+        if (idx_in_group >= k) {
+          return 0.0;
+        }
+        double gain{0.0};
+        auto g_sorted_idx = d_sorted_idx.subspan(g_begin, g_size);
+        auto g_labels = d_labels.Slice(linalg::Range(g_begin, g_begin + g_size));
+
+        if (exp_gain) {
+          gain = ltr::CalcDCGGain(g_labels(g_sorted_idx[idx_in_group]));
+        } else {
+          gain = g_labels(g_sorted_idx[idx_in_group]);
+        }
+        double discount = CalcDCGDiscount(idx_in_group);
+        return gain * discount;
+      });
+
+  CHECK(out_dcg.Contiguous());
+  std::size_t bytes;
+  cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
+                                  d_group_ptr.size() - 1, d_group_ptr.data(),
+                                  d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
+  dh::TemporaryArray<char> temp(bytes);
+  cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
+                                  d_group_ptr.size() - 1, d_group_ptr.data(),
+                                  d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
+}
+
+void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,
+                        common::Span<bst_group_t const> d_group_ptr,
+                        linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const& p) {
+  CHECK_GE(d_group_ptr.size(), 2ul);
+  size_t n_groups = d_group_ptr.size() - 1;
+  CHECK_EQ(out_inv_IDCG.Size(), n_groups);
+  dh::device_vector<std::size_t> sorted_idx(d_labels.Size());
+  auto d_sorted_idx = dh::ToSpan(sorted_idx);
+  common::SegmentedArgSort<false, true>(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx);
+  CalcQueriesDCG(ctx, d_labels, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(), out_inv_IDCG);
+  dh::LaunchN(out_inv_IDCG.Size(), ctx->CUDACtx()->Stream(),
+              [out_inv_IDCG] XGBOOST_DEVICE(size_t idx) mutable {
+                double idcg = out_inv_IDCG(idx);
+                out_inv_IDCG(idx) = CalcInvIDCG(idcg);
+              });
+}
+}  // namespace cuda_impl
+
+namespace {
+struct CheckNDCGOp {
+  CUDAContext const* cuctx;
+  template <typename It, typename Op>
+  bool operator()(It beg, It end, Op op) {
+    return thrust::none_of(cuctx->CTP(), beg, end, op);
+  }
+};
+struct CheckMAPOp {
+  CUDAContext const* cuctx;
+  template <typename It, typename Op>
+  bool operator()(It beg, It end, Op op) {
+    return thrust::all_of(cuctx->CTP(), beg, end, op);
+  }
+};
+
+struct ThreadGroupOp {
+  common::Span<bst_group_t const> d_group_ptr;
+  std::size_t n_pairs;
+
+  common::Span<std::size_t> out_thread_group_ptr;
+
+  XGBOOST_DEVICE void operator()(std::size_t i) {
+    out_thread_group_ptr[i + 1] =
+        cuda_impl::ThreadsForMean(d_group_ptr[i + 1] - d_group_ptr[i], n_pairs);
+  }
+};
+
+struct GroupSizeOp {
+  common::Span<bst_group_t const> d_group_ptr;
+
+  XGBOOST_DEVICE auto operator()(std::size_t i) -> std::size_t {
+    return d_group_ptr[i + 1] - d_group_ptr[i];
+  }
+};
+
+struct WeightOp {
+  common::OptionalWeights d_weight;
+  XGBOOST_DEVICE auto operator()(std::size_t i) -> double { return d_weight[i]; }
+};
+}  // anonymous namespace
+
+void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
+  CUDAContext const* cuctx = ctx->CUDACtx();
+  // Group offsets, a single group covering all rows is used when none is supplied.
+  group_ptr_.SetDevice(ctx->gpu_id);
+  if (info.group_ptr_.empty()) {
+    group_ptr_.Resize(2, 0);
+    group_ptr_.HostVector()[1] = info.num_row_;
+  } else {
+    auto const& h_group_ptr = info.group_ptr_;
+    group_ptr_.Resize(h_group_ptr.size());
+    auto d_group_ptr = group_ptr_.DeviceSpan();
+    dh::safe_cuda(cudaMemcpyAsync(d_group_ptr.data(), h_group_ptr.data(), d_group_ptr.size_bytes(),
+                                  cudaMemcpyHostToDevice, cuctx->Stream()));
+  }
+
+  auto d_group_ptr = DataGroupPtr(ctx);
+  std::size_t n_groups = Groups();
+  auto it = dh::MakeTransformIterator<std::size_t>(thrust::make_counting_iterator(0ul),
+                                                   GroupSizeOp{d_group_ptr});
+  max_group_size_ =
+      thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum<std::size_t>{});
+
+  threads_group_ptr_.SetDevice(ctx->gpu_id);
+  threads_group_ptr_.Resize(n_groups + 1, 0);
+  auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
+  if (param_.HasTruncation()) {
+    n_cuda_threads_ =
+        common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
+  } else {
+    auto n_pairs = Param().NumPair();
+    dh::LaunchN(n_groups, cuctx->Stream(),
+                ThreadGroupOp{d_group_ptr, n_pairs, d_threads_group_ptr});
+    thrust::inclusive_scan(cuctx->CTP(), dh::tcbegin(d_threads_group_ptr),
+                           dh::tcend(d_threads_group_ptr), dh::tbegin(d_threads_group_ptr));
+    n_cuda_threads_ = info.num_row_ * param_.NumPair();
+  }
+
+  sorted_idx_cache_.SetDevice(ctx->gpu_id);
+  sorted_idx_cache_.Resize(info.labels.Size(), 0);
+  // Sum the group weights with the same execution policy as the other thrust calls here.
+  auto weight = common::MakeOptionalWeights(ctx, info.weights_);
+  auto w_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), WeightOp{weight});
+  auto sum_weights = thrust::reduce(cuctx->CTP(), w_it, w_it + n_groups);
+  weight_norm_ = static_cast<double>(n_groups) / sum_weights;
+}
+
+common::Span<std::size_t const> RankingCache::MakeRankOnCUDA(Context const* ctx,
+                                                             common::Span<float const> predt) {
+  auto d_sorted_idx = sorted_idx_cache_.DeviceSpan();
+  auto d_group_ptr = DataGroupPtr(ctx);
+  common::SegmentedArgSort<false, true>(ctx, predt, d_group_ptr, d_sorted_idx);
+  return d_sorted_idx;
+}
+
+void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
+  CUDAContext const* cuctx = ctx->CUDACtx();
+  auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx});
+
+  auto d_group_ptr = this->DataGroupPtr(ctx);
+
+  std::size_t n_groups = d_group_ptr.size() - 1;
+  inv_idcg_ = linalg::Zeros<double>(ctx, n_groups);
+  auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id);
+  cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param());
+  CHECK_GE(this->Param().NumPair(), 1ul);
+
+  discounts_.SetDevice(ctx->gpu_id);
+  discounts_.Resize(MaxGroupSize());
+  auto d_discount = discounts_.DeviceSpan();
+  dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
+              [=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
+}
+}  // namespace xgboost::ltr
diff --git a/src/common/ranking_utils.cuh b/src/common/ranking_utils.cuh
new file mode 100644
index 000000000..297f5157e
--- /dev/null
+++ b/src/common/ranking_utils.cuh
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#ifndef XGBOOST_COMMON_RANKING_UTILS_CUH_
+#define XGBOOST_COMMON_RANKING_UTILS_CUH_
+
+#include <cstddef>            // for size_t
+
+#include "ranking_utils.h"    // for LambdaRankParam
+#include "xgboost/base.h"     // for bst_group_t, XGBOOST_DEVICE
+#include "xgboost/context.h"  // for Context
+#include "xgboost/linalg.h"   // for VectorView
+#include "xgboost/span.h"     // for Span
+
+namespace xgboost {
+namespace ltr {
+namespace cuda_impl {
+void CalcQueriesDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
+                    common::Span<std::size_t const> d_sorted_idx, bool exp_gain,
+                    common::Span<bst_group_t const> d_group_ptr, std::size_t k,
+                    linalg::VectorView<double> out_dcg);
+
+void CalcQueriesInvIDCG(Context const *ctx, linalg::VectorView<float const> d_labels,
+                        common::Span<bst_group_t const> d_group_ptr,
+                        linalg::VectorView<double> out_inv_IDCG, ltr::LambdaRankParam const &p);
+
+// Functions for creating number of threads for CUDA, and getting back the number of pairs
+// from the number of threads.
+XGBOOST_DEVICE __forceinline__ std::size_t ThreadsForMean(std::size_t group_size,
+                                                          std::size_t n_pairs) {
+  return group_size * n_pairs;
+}
+XGBOOST_DEVICE __forceinline__ std::size_t PairsForGroup(std::size_t n_threads,
+                                                         std::size_t group_size) {
+  return n_threads / group_size;
+}
+}  // namespace cuda_impl
+}  // namespace ltr
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_RANKING_UTILS_CUH_
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index 631de4d70..88283fba2 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -11,7 +11,6 @@
 #include <string>                        // for char_traits, string
 #include <vector>                        // for vector
 
-#include "./math.h"                      // for CloseTo
 #include "dmlc/parameter.h"              // for FieldEntry, DMLC_DECLARE_FIELD
 #include "error_msg.h"                   // for GroupWeight, GroupSize
 #include "xgboost/base.h"                // for XGBOOST_DEVICE, bst_group_t
@@ -19,7 +18,7 @@
 #include "xgboost/data.h"                // for MetaInfo
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/linalg.h"              // for Vector, VectorView, Tensor
-#include "xgboost/logging.h"             // for LogCheck_EQ, CHECK_EQ, CHECK
+#include "xgboost/logging.h"             // for CHECK_EQ, CHECK
 #include "xgboost/parameter.h"           // for XGBoostParameter
 #include "xgboost/span.h"                // for Span
 #include "xgboost/string_view.h"         // for StringView
@@ -34,6 +33,25 @@ using rel_degree_t = std::uint32_t;  // NOLINT
  */
 using position_t = std::uint32_t;  // NOLINT
 
+/**
+ * \brief Maximum relevance degree for NDCG
+ */
+constexpr std::size_t MaxRel() { return sizeof(rel_degree_t) * 8 - 1; }
+static_assert(MaxRel() == 31);
+
+XGBOOST_DEVICE inline double CalcDCGGain(rel_degree_t label) {
+  return static_cast<double>((1u << label) - 1);
+}
+
+XGBOOST_DEVICE inline double CalcDCGDiscount(std::size_t idx) {
+  return 1.0 / std::log2(static_cast<double>(idx) + 2.0);
+}
+
+XGBOOST_DEVICE inline double CalcInvIDCG(double idcg) {
+  auto inv_idcg = (idcg == 0.0 ? 0.0 : (1.0 / idcg));  // handle irrelevant document
+  return inv_idcg;
+}
+
 enum class PairMethod : std::int32_t {
   kTopK = 0,
   kMean = 1,
@@ -115,7 +133,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
         .describe("Number of pairs for each sample in the list.");
     DMLC_DECLARE_FIELD(lambdarank_unbiased)
         .set_default(false)
-        .describe("Unbiased lambda mart. Use IPW to debias click position");
+        .describe("Unbiased lambda mart. Use extended IPW to debias click position");
     DMLC_DECLARE_FIELD(lambdarank_bias_norm)
         .set_default(2.0)
         .set_lower_bound(0.0)
@@ -126,6 +144,220 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
   }
 };
 
+/**
+ * \brief Common cached items for ranking tasks.
+ */
+class RankingCache {
+ private:
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+  // Cached parameter
+  LambdaRankParam param_;
+  // offset to data groups.
+  HostDeviceVector<bst_group_t> group_ptr_;
+  // store the sorted index of prediction.
+  HostDeviceVector<std::size_t> sorted_idx_cache_;
+  // Maximum size of group
+  std::size_t max_group_size_{0};
+  // Normalization for weight
+  double weight_norm_{1.0};
+  /**
+   * CUDA cache
+   */
+  // offset to threads assigned to each group for gradient calculation
+  HostDeviceVector<std::size_t> threads_group_ptr_;
+  // Sorted index of label for finding buckets.
+  HostDeviceVector<std::size_t> y_sorted_idx_cache_;
+  // Cached labels sorted by the model
+  HostDeviceVector<float> y_ranked_by_model_;
+  // store rounding factor for objective for each group
+  linalg::Vector<GradientPair> roundings_;
+  // rounding factor for cost
+  HostDeviceVector<double> cost_rounding_;
+  // temporary storage for creating rounding factors. Stored as byte to avoid having cuda
+  // data structure in here.
+  HostDeviceVector<std::uint8_t> max_lambdas_;
+  // total number of cuda threads used for gradient calculation
+  std::size_t n_cuda_threads_{0};
+
+  // Create model rank list on GPU
+  common::Span<std::size_t const> MakeRankOnCUDA(Context const* ctx,
+                                                 common::Span<float const> predt);
+  // Create model rank list on CPU
+  common::Span<std::size_t const> MakeRankOnCPU(Context const* ctx,
+                                                common::Span<float const> predt);
+
+ protected:
+  [[nodiscard]] std::size_t MaxGroupSize() const { return max_group_size_; }
+
+ public:
+  RankingCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : param_{p} {
+    CHECK(param_.GetInitialised());
+    if (!info.group_ptr_.empty()) {
+      CHECK_EQ(info.group_ptr_.back(), info.labels.Size())
+          << error::GroupSize() << "the size of label.";
+    }
+    if (ctx->IsCPU()) {
+      this->InitOnCPU(ctx, info);
+    } else {
+      this->InitOnCUDA(ctx, info);
+    }
+    if (!info.weights_.Empty()) {
+      CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight();
+    }
+  }
+  [[nodiscard]] std::size_t MaxPositionSize() const {
+    // Use truncation level as bound.
+    if (param_.HasTruncation()) {
+      return param_.NumPair();
+    }
+    // Hardcoded maximum size of positions to track. We don't need too many of them as the
+    // bias decreases exponentially.
+    return std::min(max_group_size_, static_cast<std::size_t>(32));
+  }
+  // Constructed as [0, n_samples] if group ptr is not supplied by the user
+  common::Span<bst_group_t const> DataGroupPtr(Context const* ctx) const {
+    group_ptr_.SetDevice(ctx->gpu_id);
+    return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan();
+  }
+
+  [[nodiscard]] auto const& Param() const { return param_; }
+  [[nodiscard]] std::size_t Groups() const { return group_ptr_.Size() - 1; }
+  [[nodiscard]] double WeightNorm() const { return weight_norm_; }
+
+  // Create a rank list by model prediction
+  common::Span<std::size_t const> SortedIdx(Context const* ctx, common::Span<float const> predt) {
+    if (sorted_idx_cache_.Empty()) {
+      sorted_idx_cache_.SetDevice(ctx->gpu_id);
+      sorted_idx_cache_.Resize(predt.size());
+    }
+    if (ctx->IsCPU()) {
+      return this->MakeRankOnCPU(ctx, predt);
+    } else {
+      return this->MakeRankOnCUDA(ctx, predt);
+    }
+  }
+  // The function simply returns an uninitialized buffer as this is only used by the
+  // objective for creating pairs.
+  common::Span<std::size_t> SortedIdxY(Context const* ctx, std::size_t n_samples) {
+    CHECK(ctx->IsCUDA());
+    if (y_sorted_idx_cache_.Empty()) {
+      y_sorted_idx_cache_.SetDevice(ctx->gpu_id);
+      y_sorted_idx_cache_.Resize(n_samples);
+    }
+    return y_sorted_idx_cache_.DeviceSpan();
+  }
+  common::Span<float> RankedY(Context const* ctx, std::size_t n_samples) {
+    CHECK(ctx->IsCUDA());
+    if (y_ranked_by_model_.Empty()) {
+      y_ranked_by_model_.SetDevice(ctx->gpu_id);
+      y_ranked_by_model_.Resize(n_samples);
+    }
+    return y_ranked_by_model_.DeviceSpan();
+  }
+
+  // CUDA cache getters, the cache is shared between metric and objective, some of these
+  // fields are lazy initialized to avoid unnecessary allocation.
+  [[nodiscard]] common::Span<std::size_t const> CUDAThreadsGroupPtr() const {
+    CHECK(!threads_group_ptr_.Empty());
+    return threads_group_ptr_.ConstDeviceSpan();
+  }
+  [[nodiscard]] std::size_t CUDAThreads() const { return n_cuda_threads_; }
+
+  linalg::VectorView<GradientPair> CUDARounding(Context const* ctx) {
+    if (roundings_.Size() == 0) {
+      roundings_.SetDevice(ctx->gpu_id);
+      roundings_.Reshape(Groups());
+    }
+    return roundings_.View(ctx->gpu_id);
+  }
+  common::Span<double> CUDACostRounding(Context const* ctx) {
+    if (cost_rounding_.Size() == 0) {
+      cost_rounding_.SetDevice(ctx->gpu_id);
+      cost_rounding_.Resize(1);
+    }
+    return cost_rounding_.DeviceSpan();
+  }
+  template <typename Type>
+  common::Span<Type> MaxLambdas(Context const* ctx, std::size_t n) {
+    max_lambdas_.SetDevice(ctx->gpu_id);
+    std::size_t bytes = n * sizeof(Type);
+    if (bytes != max_lambdas_.Size()) {
+      max_lambdas_.Resize(bytes);
+    }
+    return common::Span<Type>{reinterpret_cast<Type*>(max_lambdas_.DevicePointer()), n};
+  }
+};
+
+class NDCGCache : public RankingCache {
+  // NDCG discount
+  HostDeviceVector<double> discounts_;
+  // 1.0 / IDCG
+  linalg::Vector<double> inv_idcg_;
+  /**
+   * CUDA cache
+   */
+  // store the intermediate DCG calculation result for metric
+  linalg::Vector<double> dcg_;
+
+ public:
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+
+ public:
+  NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
+      : RankingCache{ctx, info, p} {
+    if (ctx->IsCPU()) {
+      this->InitOnCPU(ctx, info);
+    } else {
+      this->InitOnCUDA(ctx, info);
+    }
+  }
+
+  linalg::VectorView<double const> InvIDCG(Context const* ctx) const {
+    return inv_idcg_.View(ctx->gpu_id);
+  }
+  common::Span<double const> Discount(Context const* ctx) const {
+    return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan();
+  }
+  linalg::VectorView<double> Dcg(Context const* ctx) {
+    if (dcg_.Size() == 0) {
+      dcg_.SetDevice(ctx->gpu_id);
+      dcg_.Reshape(this->Groups());
+    }
+    return dcg_.View(ctx->gpu_id);
+  }
+};
+
+/**
+ * \brief Validate label for NDCG. With the exponential gain, labels must be non-negative
+ *        integers no larger than MaxRel(); nothing is checked otherwise.
+ *
+ * \tparam NoneOf Implementation of std::none_of. Specified as a parameter to reuse the
+ *                check for both CPU and GPU.
+ */
+template <typename NoneOf>
+void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float const> labels,
+                     NoneOf none_of) {
+  if (!p.ndcg_exp_gain) {
+    return;
+  }
+  auto d_labels = labels.Values();
+  auto label_is_integer =
+      none_of(d_labels.data(), d_labels.data() + d_labels.size(), [] XGBOOST_DEVICE(float v) {
+        return std::fabs(std::floor(v) - v) > kRtEps || v < 0.0f;
+      });
+  CHECK(label_is_integer)
+      << "When using relevance degree as target, label must be either 0 or positive integer.";
+  // Compare as float, converting an out-of-range float to rel_degree_t is undefined.
+  auto label_is_valid = none_of(
+      d_labels.data(), d_labels.data() + d_labels.size(),
+      [] XGBOOST_DEVICE(float v) { return v > static_cast<float>(MaxRel()); });
+  CHECK(label_is_valid) << "Relevance degrees must be less than or equal to " << MaxRel()
+                        << " when the exponential NDCG gain function is used. "
+                        << "Set `ndcg_exp_gain` to false to use custom DCG gain.";
+}
+
 /**
  * \brief Parse name for ranking metric given parameters.
  *
diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc
index c73cffed7..9240db0d4 100644
--- a/tests/cpp/common/test_ranking_utils.cc
+++ b/tests/cpp/common/test_ranking_utils.cc
@@ -1,16 +1,25 @@
 /**
  * Copyright 2023 by XGBoost Contributors
  */
-#include <gtest/gtest.h>                        // for Test, AssertionResult, Message, TestPartR...
-#include <gtest/gtest.h>                        // for ASSERT_NEAR, ASSERT_T...
-#include <xgboost/base.h>                       // for Args
+#include "test_ranking_utils.h"
+
+#include <gtest/gtest.h>
+#include <xgboost/base.h>                       // for Args, bst_group_t, kRtEps
 #include <xgboost/context.h>                    // for Context
+#include <xgboost/data.h>                       // for MetaInfo, DMatrix
+#include <xgboost/host_device_vector.h>         // for HostDeviceVector
+#include <xgboost/logging.h>                    // for Error
 #include <xgboost/string_view.h>                // for StringView
 
+#include <cstddef>                              // for size_t
 #include <cstdint>                              // for uint32_t
-#include <utility>                              // for pair
+#include <numeric>                              // for iota
+#include <utility>                              // for move
+#include <vector>                               // for vector
 
+#include "../../../src/common/numeric.h"        // for Iota
 #include "../../../src/common/ranking_utils.h"  // for LambdaRankParam, ParseMetricName, MakeMet...
+#include "../helpers.h"                         // for EmptyDMatrix
 
 namespace xgboost::ltr {
 TEST(RankingUtils, LambdaRankParam) {
@@ -66,4 +75,106 @@ TEST(RankingUtils, MakeMetricName) {
   name = MakeMetricName("map", 2, false);
   ASSERT_EQ(name, "map@2");
 }
+
+void TestRankingCache(Context const* ctx) {
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+
+  info.num_row_ = 16;
+  info.labels.Reshape(info.num_row_);
+  auto& h_label = info.labels.Data()->HostVector();
+  for (std::size_t i = 0; i < h_label.size(); ++i) {
+    h_label[i] = i % 2;
+  }
+
+  LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  RankingCache cache{ctx, info, param};
+
+  HostDeviceVector<float> predt(info.num_row_, 0);
+  auto& h_predt = predt.HostVector();
+  std::iota(h_predt.begin(), h_predt.end(), 0.0f);
+  predt.SetDevice(ctx->gpu_id);
+
+  auto rank_idx =
+      cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
+
+  for (std::size_t i = 0; i < rank_idx.size(); ++i) {
+    ASSERT_EQ(rank_idx[i], rank_idx.size() - i - 1);
+  }
+}
+
+TEST(RankingCache, InitFromCPU) {
+  Context ctx;
+  TestRankingCache(&ctx);
+}
+
+void TestNDCGCache(Context const* ctx) {
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  {
+    // empty
+    NDCGCache cache{ctx, info, param};
+    ASSERT_EQ(cache.DataGroupPtr(ctx).size(), 2);
+  }
+
+  info.num_row_ = 3;
+  info.group_ptr_ = {static_cast<bst_group_t>(0), static_cast<bst_group_t>(info.num_row_)};
+
+  {
+    auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
+    // empty label
+    ASSERT_THROW(fail(), dmlc::Error);
+    info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId};
+    // invalid label
+    ASSERT_THROW(fail(), dmlc::Error);
+    auto h_labels = info.labels.HostView();
+    for (std::size_t i = 0; i < h_labels.Size(); ++i) {
+      h_labels(i) *= 10;
+    }
+    param.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
+    NDCGCache cache{ctx, info, param};
+    Context cpuctx;
+    auto inv_idcg = cache.InvIDCG(&cpuctx);
+    ASSERT_EQ(inv_idcg.Size(), 1);
+    ASSERT_NEAR(1.0 / inv_idcg(0), 2.63093, kRtEps);
+  }
+
+  {
+    param.UpdateAllowUnknown(Args{{"lambdarank_unbiased", "false"}});
+
+    std::vector<float> h_data(32);
+
+    common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f);
+    info.labels.Reshape(h_data.size());
+    info.num_row_ = h_data.size();
+    info.group_ptr_.back() = info.num_row_;
+    info.labels.Data()->HostVector() = std::move(h_data);
+
+    {
+      NDCGCache cache{ctx, info, param};
+      Context cpuctx;
+      auto inv_idcg = cache.InvIDCG(&cpuctx);
+      ASSERT_NEAR(inv_idcg(0), 0.00551782, kRtEps);
+    }
+
+    param.UpdateAllowUnknown(
+        Args{{"lambdarank_num_pair_per_sample", "3"}, {"lambdarank_pair_method", "topk"}});
+    {
+      NDCGCache cache{ctx, info, param};
+      Context cpuctx;
+      auto inv_idcg = cache.InvIDCG(&cpuctx);
+      ASSERT_NEAR(inv_idcg(0), 0.01552123, kRtEps);
+    }
+  }
+}
+
+TEST(NDCGCache, InitFromCPU) {
+  Context ctx;
+  TestNDCGCache(&ctx);
+}
 }  // namespace xgboost::ltr
diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu
new file mode 100644
index 000000000..5fda42c72
--- /dev/null
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -0,0 +1,98 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>                          // for Args, XGBOOST_DEVICE, bst_group_t, kRtEps
+#include <xgboost/context.h>                       // for Context
+#include <xgboost/linalg.h>                        // for MakeTensorView, Vector
+
+#include <cstddef>                                 // for size_t
+#include <memory>                                  // for shared_ptr
+#include <numeric>                                 // for iota
+#include <vector>                                  // for vector
+
+#include "../../../src/common/algorithm.cuh"       // for SegmentedSequence
+#include "../../../src/common/cuda_context.cuh"    // for CUDAContext
+#include "../../../src/common/device_helpers.cuh"  // for device_vector, ToSpan
+#include "../../../src/common/ranking_utils.cuh"   // for CalcQueriesInvIDCG
+#include "../../../src/common/ranking_utils.h"     // for LambdaRankParam, RankingCache
+#include "../helpers.h"                            // for EmptyDMatrix
+#include "test_ranking_utils.h"                    // for TestNDCGCache
+#include "xgboost/data.h"                          // for MetaInfo
+#include "xgboost/host_device_vector.h"            // for HostDeviceVector
+
+namespace xgboost::ltr {
+void TestCalcQueriesInvIDCG() {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  std::size_t n_groups = 5, n_samples_per_group = 32;
+
+  dh::device_vector<float> scores(n_samples_per_group * n_groups);
+  dh::device_vector<bst_group_t> group_ptr(n_groups + 1);
+  auto d_group_ptr = dh::ToSpan(group_ptr);
+  dh::LaunchN(d_group_ptr.size(), ctx.CUDACtx()->Stream(),
+              [=] XGBOOST_DEVICE(std::size_t i) { d_group_ptr[i] = i * n_samples_per_group; });
+
+  auto d_scores = dh::ToSpan(scores);
+  common::SegmentedSequence(&ctx, d_group_ptr, d_scores);
+
+  linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id);
+
+  ltr::LambdaRankParam p;
+  p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
+
+  cuda_impl::CalcQueriesInvIDCG(&ctx, linalg::MakeTensorView(&ctx, d_scores, d_scores.size()),
+                                dh::ToSpan(group_ptr), inv_IDCG.View(ctx.gpu_id), p);
+  for (std::size_t i = 0; i < n_groups; ++i) {
+    double inv_idcg = inv_IDCG(i);
+    ASSERT_NEAR(inv_idcg, 0.00551782, kRtEps);
+  }
+}
+
+TEST(RankingUtils, CalcQueriesInvIDCG) { TestCalcQueriesInvIDCG(); }
+
+namespace {
+void TestRankingCache(Context const* ctx) {
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+
+  info.num_row_ = 16;
+  info.labels.Reshape(info.num_row_);
+  auto& h_label = info.labels.Data()->HostVector();
+  for (std::size_t i = 0; i < h_label.size(); ++i) {
+    h_label[i] = i % 2;
+  }
+
+  LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  RankingCache cache{ctx, info, param};
+
+  HostDeviceVector<float> predt(info.num_row_, 0);
+  auto& h_predt = predt.HostVector();
+  std::iota(h_predt.begin(), h_predt.end(), 0.0f);
+  predt.SetDevice(ctx->gpu_id);
+
+  auto rank_idx =
+      cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
+
+  std::vector<std::size_t> h_rank_idx(rank_idx.size());
+  dh::CopyDeviceSpanToVector(&h_rank_idx, rank_idx);
+  for (std::size_t i = 0; i < rank_idx.size(); ++i) {
+    ASSERT_EQ(h_rank_idx[i], h_rank_idx.size() - i - 1);
+  }
+}
+}  // namespace
+
+TEST(RankingCache, InitFromGPU) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestRankingCache(&ctx);
+}
+
+TEST(NDCGCache, InitFromGPU) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestNDCGCache(&ctx);
+}
+}  // namespace xgboost::ltr
diff --git a/tests/cpp/common/test_ranking_utils.h b/tests/cpp/common/test_ranking_utils.h
new file mode 100644
index 000000000..ede687ff4
--- /dev/null
+++ b/tests/cpp/common/test_ranking_utils.h
@@ -0,0 +1,9 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#pragma once
+#include <xgboost/context.h>  // for Context
+
+namespace xgboost::ltr {
+void TestNDCGCache(Context const* ctx);
+}  // namespace xgboost::ltr

From c400fa1e8d8399ec867e0a18ffec7cd7df00de83 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 14 Mar 2023 19:07:10 +0800
Subject: [PATCH 08/32] Predictor for vector leaf. (#8898)

---
 src/predictor/cpu_predictor.cc            | 341 +++++++++++++---------
 src/predictor/predict_fn.h                |  30 +-
 tests/cpp/helpers.cc                      |  57 ++--
 tests/cpp/helpers.h                       |  17 +-
 tests/cpp/predictor/test_cpu_predictor.cc |   6 +
 tests/cpp/predictor/test_predictor.cc     | 131 ++++++++-
 tests/cpp/predictor/test_predictor.h      |  13 +-
 7 files changed, 410 insertions(+), 185 deletions(-)

diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index a4b78fefd..0c045dda0 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -1,52 +1,64 @@
 /**
  * Copyright 2017-2023 by XGBoost Contributors
  */
-#include <dmlc/omp.h>
+#include <algorithm>  // for max, fill, min
+#include <any>        // for any, any_cast
+#include <cassert>    // for assert
+#include <cstddef>    // for size_t
+#include <cstdint>    // for uint32_t, int32_t, uint64_t
+#include <memory>     // for unique_ptr, shared_ptr
+#include <ostream>    // for char_traits, operator<<, basic_ostream
+#include <typeinfo>   // for type_info
+#include <vector>     // for vector
 
-#include <any>  // for any, any_cast
-#include <cstddef>
-#include <limits>
-#include <mutex>
+#include "../collective/communicator-inl.h"   // for Allreduce, IsDistributed
+#include "../collective/communicator.h"       // for Operation
+#include "../common/bitfield.h"               // for RBitField8
+#include "../common/categorical.h"            // for IsCat, Decision
+#include "../common/common.h"                 // for DivRoundUp
+#include "../common/math.h"                   // for CheckNAN
+#include "../common/threading_utils.h"        // for ParallelFor
+#include "../data/adapter.h"                  // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
+#include "../data/gradient_index.h"           // for GHistIndexMatrix
+#include "../data/proxy_dmatrix.h"            // for DMatrixProxy
+#include "../gbm/gbtree_model.h"              // for GBTreeModel, GBTreeModelParam
+#include "cpu_treeshap.h"                     // for CalculateContributions
+#include "dmlc/registry.h"                    // for DMLC_REGISTRY_FILE_TAG
+#include "predict_fn.h"                       // for GetNextNode, GetNextNodeMulti
+#include "xgboost/base.h"                     // for bst_float, bst_node_t, bst_omp_uint, bst_fe...
+#include "xgboost/context.h"                  // for Context
+#include "xgboost/data.h"                     // for Entry, DMatrix, MetaInfo, SparsePage, Batch...
+#include "xgboost/host_device_vector.h"       // for HostDeviceVector
+#include "xgboost/learner.h"                  // for LearnerModelParam
+#include "xgboost/linalg.h"                   // for TensorView, All, VectorView, Tensor
+#include "xgboost/logging.h"                  // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_NE
+#include "xgboost/multi_target_tree_model.h"  // for MultiTargetTree
+#include "xgboost/predictor.h"                // for PredictionCacheEntry, Predictor, PredictorReg
+#include "xgboost/span.h"                     // for Span
+#include "xgboost/tree_model.h"               // for RegTree, MTNotImplemented, RTreeNodeStat
 
-#include "../collective/communicator-inl.h"
-#include "../common/categorical.h"
-#include "../common/math.h"
-#include "../common/threading_utils.h"
-#include "../data/adapter.h"
-#include "../data/gradient_index.h"
-#include "../gbm/gbtree_model.h"
-#include "cpu_treeshap.h"  // CalculateContributions
-#include "predict_fn.h"
-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/logging.h"
-#include "xgboost/predictor.h"
-#include "xgboost/tree_model.h"
-
-namespace xgboost {
-namespace predictor {
+namespace xgboost::predictor {
 
 DMLC_REGISTRY_FILE_TAG(cpu_predictor);
 
+namespace scalar {
 template <bool has_missing, bool has_categorical>
 bst_node_t GetLeafIndex(RegTree const &tree, const RegTree::FVec &feat,
-                        RegTree::CategoricalSplitMatrix const& cats) {
-  bst_node_t nid = 0;
-  while (!tree[nid].IsLeaf()) {
-    unsigned split_index = tree[nid].SplitIndex();
+                        RegTree::CategoricalSplitMatrix const &cats) {
+  bst_node_t nidx{0};
+  while (!tree[nidx].IsLeaf()) {
+    bst_feature_t split_index = tree[nidx].SplitIndex();
     auto fvalue = feat.GetFvalue(split_index);
-    nid = GetNextNode<has_missing, has_categorical>(
-        tree[nid], nid, fvalue, has_missing && feat.IsMissing(split_index), cats);
+    nidx = GetNextNode<has_missing, has_categorical>(
+        tree[nidx], nidx, fvalue, has_missing && feat.IsMissing(split_index), cats);
   }
-  return nid;
+  return nidx;
 }
 
 bst_float PredValue(const SparsePage::Inst &inst,
                     const std::vector<std::unique_ptr<RegTree>> &trees,
-                    const std::vector<int> &tree_info, int bst_group,
-                    RegTree::FVec *p_feats, unsigned tree_begin,
-                    unsigned tree_end) {
+                    const std::vector<int> &tree_info, std::int32_t bst_group,
+                    RegTree::FVec *p_feats, std::uint32_t tree_begin, std::uint32_t tree_end) {
   bst_float psum = 0.0f;
   p_feats->Fill(inst);
   for (size_t i = tree_begin; i < tree_end; ++i) {
@@ -68,40 +80,92 @@ bst_float PredValue(const SparsePage::Inst &inst,
 }
 
 template <bool has_categorical>
-bst_float
-PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
-                   RegTree::CategoricalSplitMatrix const& cats) {
-  const bst_node_t leaf = p_feats.HasMissing() ?
-    GetLeafIndex<true, has_categorical>(tree, p_feats, cats) :
-    GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
+bst_float PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
+                             RegTree::CategoricalSplitMatrix const &cats) {
+  const bst_node_t leaf = p_feats.HasMissing()
+                              ? GetLeafIndex<true, has_categorical>(tree, p_feats, cats)
+                              : GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
   return tree[leaf].LeafValue();
 }
 
 void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
-                       const size_t tree_end, std::vector<bst_float> *out_preds,
-                       const size_t predict_offset, const size_t num_group,
-                       const std::vector<RegTree::FVec> &thread_temp,
-                       const size_t offset, const size_t block_size) {
-  std::vector<bst_float> &preds = *out_preds;
+                       const size_t tree_end, const size_t predict_offset,
+                       const std::vector<RegTree::FVec> &thread_temp, const size_t offset,
+                       const size_t block_size, linalg::TensorView<float, 2> out_predt) {
   for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
     const size_t gid = model.tree_info[tree_id];
     auto const &tree = *model.trees[tree_id];
-    auto const& cats = tree.GetCategoriesMatrix();
+    auto const &cats = tree.GetCategoriesMatrix();
     auto has_categorical = tree.HasCategoricalSplit();
 
     if (has_categorical) {
-      for (size_t i = 0; i < block_size; ++i) {
-        preds[(predict_offset + i) * num_group + gid] +=
+      for (std::size_t i = 0; i < block_size; ++i) {
+        out_predt(predict_offset + i, gid) +=
             PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
       }
     } else {
-      for (size_t i = 0; i < block_size; ++i) {
-        preds[(predict_offset + i) * num_group + gid] +=
-            PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
+      for (std::size_t i = 0; i < block_size; ++i) {  // tree has no categorical split
+        out_predt(predict_offset + i, gid) +=
+            PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
       }
     }
   }
 }
+}  // namespace scalar
+
+namespace multi {
+template <bool has_missing, bool has_categorical>
+bst_node_t GetLeafIndex(MultiTargetTree const &tree, const RegTree::FVec &feat,
+                        RegTree::CategoricalSplitMatrix const &cats) {
+  bst_node_t nidx{0};
+  while (!tree.IsLeaf(nidx)) {  // walk down from node 0 until a leaf is reached
+    bst_feature_t split_index = tree.SplitIndex(nidx);
+    auto fvalue = feat.GetFvalue(split_index);
+    nidx = GetNextNodeMulti<has_missing, has_categorical>(
+        tree, nidx, fvalue, has_missing && feat.IsMissing(split_index), cats);
+  }
+  return nidx;
+}
+
+template <bool has_categorical>
+void PredValueByOneTree(const RegTree::FVec &p_feats, MultiTargetTree const &tree,
+                        RegTree::CategoricalSplitMatrix const &cats,
+                        linalg::VectorView<float> out_predt) {
+  bst_node_t const leaf = p_feats.HasMissing()
+                              ? GetLeafIndex<true, has_categorical>(tree, p_feats, cats)
+                              : GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
+  auto leaf_value = tree.LeafValue(leaf);
+  assert(out_predt.Shape(0) == leaf_value.Shape(0) && "shape mismatch.");
+  for (size_t i = 0; i < leaf_value.Size(); ++i) {
+    out_predt(i) += leaf_value(i);
+  }
+}
+
+void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
+                       const size_t tree_end, const size_t predict_offset,
+                       const std::vector<RegTree::FVec> &thread_temp, const size_t offset,
+                       const size_t block_size, linalg::TensorView<float, 2> out_predt) {
+  for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
+    auto const &tree = *model.trees.at(tree_id);
+    auto cats = tree.GetCategoriesMatrix();
+    bool has_categorical = tree.HasCategoricalSplit();
+
+    if (has_categorical) {
+      for (std::size_t i = 0; i < block_size; ++i) {
+        auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
+        PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
+                                 t_predts);
+      }
+    } else {
+      for (std::size_t i = 0; i < block_size; ++i) {
+        auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
+        PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
+                                  t_predts);
+      }
+    }
+  }
+}
+}  // namespace multi
 
 template <typename DataView>
 void FVecFill(const size_t block_size, const size_t batch_offset, const int num_feature,
@@ -127,7 +191,7 @@ void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batc
 }
 
 namespace {
-static size_t constexpr kUnroll = 8;
+static std::size_t constexpr kUnroll = 8;
 }  // anonymous namespace
 
 struct SparsePageView {
@@ -227,15 +291,13 @@ class AdapterView {
 };
 
 template <typename DataView, size_t block_of_rows_size>
-void PredictBatchByBlockOfRowsKernel(
-    DataView batch, std::vector<bst_float> *out_preds,
-    gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end,
-    std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads) {
+void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model,
+                                     int32_t tree_begin, int32_t tree_end,
+                                     std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads,
+                                     linalg::TensorView<float, 2> out_predt) {
   auto &thread_temp = *p_thread_temp;
-  int32_t const num_group = model.learner_model_param->num_output_group;
 
-  CHECK_EQ(model.param.size_leaf_vector, 0)
-      << "size_leaf_vector is enforced to 0 so far";
+  CHECK_EQ(model.param.size_leaf_vector, 0) << "size_leaf_vector is enforced to 0 so far";
   // parallel over local batch
   const auto nsize = static_cast<bst_omp_uint>(batch.Size());
   const int num_feature = model.learner_model_param->num_feature;
@@ -243,16 +305,19 @@ void PredictBatchByBlockOfRowsKernel(
 
   common::ParallelFor(n_blocks, n_threads, [&](bst_omp_uint block_id) {
     const size_t batch_offset = block_id * block_of_rows_size;
-    const size_t block_size =
-        std::min(nsize - batch_offset, block_of_rows_size);
+    const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size);
     const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size;
 
-    FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset,
-             p_thread_temp);
+    FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp);
     // process block of rows through all trees to keep cache locality
-    PredictByAllTrees(model, tree_begin, tree_end, out_preds,
-                      batch_offset + batch.base_rowid, num_group, thread_temp,
-                      fvec_offset, block_size);
+    if (model.learner_model_param->IsVectorLeaf()) {
+      multi::PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid,
+                               thread_temp, fvec_offset, block_size, out_predt);
+    } else {
+      scalar::PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid,
+                                thread_temp, fvec_offset, block_size, out_predt);
+    }
+
     FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
   });
 }
@@ -557,33 +622,6 @@ class ColumnSplitHelper {
 
 class CPUPredictor : public Predictor {
  protected:
-  void PredictGHistIndex(DMatrix *p_fmat, gbm::GBTreeModel const &model, int32_t tree_begin,
-                         int32_t tree_end, std::vector<bst_float> *out_preds) const {
-    auto const n_threads = this->ctx_->Threads();
-
-    constexpr double kDensityThresh = .5;
-    size_t total =
-        std::max(p_fmat->Info().num_row_ * p_fmat->Info().num_col_, static_cast<uint64_t>(1));
-    double density = static_cast<double>(p_fmat->Info().num_nonzero_) / static_cast<double>(total);
-    bool blocked = density > kDensityThresh;
-
-    std::vector<RegTree::FVec> feat_vecs;
-    InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs);
-    std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
-    auto ft = p_fmat->Info().feature_types.ConstHostVector();
-    for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
-      if (blocked) {
-        PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
-            GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads},
-            out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
-      } else {
-        PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, 1>(
-            GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads},
-            out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
-      }
-    }
-  }
-
   void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
                       gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
     if (p_fmat->IsColumnSplit()) {
@@ -592,11 +630,6 @@ class CPUPredictor : public Predictor {
       return;
     }
 
-    if (!p_fmat->PageExists<SparsePage>()) {
-      this->PredictGHistIndex(p_fmat, model, tree_begin, tree_end, out_preds);
-      return;
-    }
-
     auto const n_threads = this->ctx_->Threads();
     constexpr double kDensityThresh = .5;
     size_t total =
@@ -606,16 +639,38 @@ class CPUPredictor : public Predictor {
 
     std::vector<RegTree::FVec> feat_vecs;
     InitThreadTemp(n_threads * (blocked ? kBlockOfRowsSize : 1), &feat_vecs);
-    for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
-      CHECK_EQ(out_preds->size(),
-               p_fmat->Info().num_row_ * model.learner_model_param->num_output_group);
-      if (blocked) {
-        PredictBatchByBlockOfRowsKernel<SparsePageView, kBlockOfRowsSize>(
-            SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
 
-      } else {
-        PredictBatchByBlockOfRowsKernel<SparsePageView, 1>(
-            SparsePageView{&batch}, out_preds, model, tree_begin, tree_end, &feat_vecs, n_threads);
+    std::size_t n_samples = p_fmat->Info().num_row_;
+    std::size_t n_groups = model.learner_model_param->OutputLength();
+    CHECK_EQ(out_preds->size(), n_samples * n_groups);
+    linalg::TensorView<float, 2> out_predt{*out_preds, {n_samples, n_groups}, ctx_->gpu_id};
+
+    if (!p_fmat->PageExists<SparsePage>()) {
+      std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
+      auto ft = p_fmat->Info().feature_types.ConstHostVector();
+      for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
+        if (blocked) {
+          PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
+              GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
+              tree_begin, tree_end, &feat_vecs, n_threads, out_predt);
+        } else {
+          PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, 1>(
+              GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
+              tree_begin, tree_end, &feat_vecs, n_threads, out_predt);
+        }
+      }
+    } else {
+      for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
+        if (blocked) {
+          PredictBatchByBlockOfRowsKernel<SparsePageView, kBlockOfRowsSize>(
+              SparsePageView{&batch}, model, tree_begin, tree_end, &feat_vecs, n_threads,
+              out_predt);
+
+        } else {
+          PredictBatchByBlockOfRowsKernel<SparsePageView, 1>(SparsePageView{&batch}, model,
+                                                             tree_begin, tree_end, &feat_vecs,
+                                                             n_threads, out_predt);
+        }
       }
     }
   }
@@ -623,17 +678,15 @@ class CPUPredictor : public Predictor {
  public:
   explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {}
 
-  void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
-                    const gbm::GBTreeModel &model, uint32_t tree_begin,
-                    uint32_t tree_end = 0) const override {
-    auto* out_preds = &predts->predictions;
+  void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model,
+                    uint32_t tree_begin, uint32_t tree_end = 0) const override {
+    auto *out_preds = &predts->predictions;
     // This is actually already handled in gbm, but large amount of tests rely on the
     // behaviour.
     if (tree_end == 0) {
       tree_end = model.trees.size();
     }
-    this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin,
-                         tree_end);
+    this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin, tree_end);
   }
 
   template <typename Adapter, size_t kBlockSize>
@@ -653,13 +706,16 @@ class CPUPredictor : public Predictor {
       info.num_row_ = m->NumRows();
       this->InitOutPredictions(info, &(out_preds->predictions), model);
     }
+
     std::vector<Entry> workspace(m->NumColumns() * kUnroll * n_threads);
     auto &predictions = out_preds->predictions.HostVector();
     std::vector<RegTree::FVec> thread_temp;
     InitThreadTemp(n_threads * kBlockSize, &thread_temp);
+    std::size_t n_groups = model.learner_model_param->OutputLength();
+    linalg::TensorView<float, 2> out_predt{predictions, {m->NumRows(), n_groups}, Context::kCpuId};
     PredictBatchByBlockOfRowsKernel<AdapterView<Adapter>, kBlockSize>(
-        AdapterView<Adapter>(m.get(), missing, common::Span<Entry>{workspace}, n_threads),
-        &predictions, model, tree_begin, tree_end, &thread_temp, n_threads);
+        AdapterView<Adapter>(m.get(), missing, common::Span<Entry>{workspace}, n_threads), model,
+        tree_begin, tree_end, &thread_temp, n_threads, out_predt);
   }
 
   bool InplacePredict(std::shared_ptr<DMatrix> p_m, const gbm::GBTreeModel &model, float missing,
@@ -689,6 +745,7 @@ class CPUPredictor : public Predictor {
   void PredictInstance(const SparsePage::Inst& inst,
                        std::vector<bst_float>* out_preds,
                        const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
+    CHECK(!model.learner_model_param->IsVectorLeaf()) << "predict instance" << MTNotImplemented();
     std::vector<RegTree::FVec> feat_vecs;
     feat_vecs.resize(1, RegTree::FVec());
     feat_vecs[0].Init(model.learner_model_param->num_feature);
@@ -701,31 +758,30 @@ class CPUPredictor : public Predictor {
     auto base_score = model.learner_model_param->BaseScore(ctx_)(0);
     // loop over output groups
     for (uint32_t gid = 0; gid < model.learner_model_param->num_output_group; ++gid) {
-      (*out_preds)[gid] =
-          PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0], 0, ntree_limit) +
-          base_score;
+      (*out_preds)[gid] = scalar::PredValue(inst, model.trees, model.tree_info, gid, &feat_vecs[0],
+                                            0, ntree_limit) +
+                          base_score;
     }
   }
 
-  void PredictLeaf(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_preds,
-                   const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
+  void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *out_preds,
+                   const gbm::GBTreeModel &model, unsigned ntree_limit) const override {
     auto const n_threads = this->ctx_->Threads();
     std::vector<RegTree::FVec> feat_vecs;
     const int num_feature = model.learner_model_param->num_feature;
     InitThreadTemp(n_threads, &feat_vecs);
-    const MetaInfo& info = p_fmat->Info();
+    const MetaInfo &info = p_fmat->Info();
     // number of valid trees
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
       ntree_limit = static_cast<unsigned>(model.trees.size());
     }
-    std::vector<bst_float>& preds = out_preds->HostVector();
+    std::vector<bst_float> &preds = out_preds->HostVector();
     preds.resize(info.num_row_ * ntree_limit);
     // start collecting the prediction
     for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
       // parallel over local batch
       auto page = batch.GetView();
-      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
-      common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
+      common::ParallelFor(page.Size(), n_threads, [&](auto i) {
         const int tid = omp_get_thread_num();
         auto ridx = static_cast<size_t>(batch.base_rowid + i);
         RegTree::FVec &feats = feat_vecs[tid];
@@ -733,23 +789,28 @@ class CPUPredictor : public Predictor {
           feats.Init(num_feature);
         }
         feats.Fill(page[i]);
-        for (unsigned j = 0; j < ntree_limit; ++j) {
-          auto const& tree = *model.trees[j];
-          auto const& cats = tree.GetCategoriesMatrix();
-          bst_node_t tid = GetLeafIndex<true, true>(tree, feats, cats);
-          preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
+        for (std::uint32_t j = 0; j < ntree_limit; ++j) {
+          auto const &tree = *model.trees[j];
+          auto const &cats = tree.GetCategoriesMatrix();
+          bst_node_t nidx;
+          if (tree.IsMultiTarget()) {
+            nidx = multi::GetLeafIndex<true, true>(*tree.GetMultiTargetTree(), feats, cats);
+          } else {
+            nidx = scalar::GetLeafIndex<true, true>(tree, feats, cats);
+          }
+          preds[ridx * ntree_limit + j] = static_cast<bst_float>(nidx);
         }
         feats.Drop(page[i]);
       });
     }
   }
 
-  void PredictContribution(DMatrix *p_fmat,
-                           HostDeviceVector<float> *out_contribs,
+  void PredictContribution(DMatrix *p_fmat, HostDeviceVector<float> *out_contribs,
                            const gbm::GBTreeModel &model, uint32_t ntree_limit,
-                           std::vector<bst_float> const *tree_weights,
-                           bool approximate, int condition,
-                           unsigned condition_feature) const override {
+                           std::vector<bst_float> const *tree_weights, bool approximate,
+                           int condition, unsigned condition_feature) const override {
+    CHECK(!model.learner_model_param->IsVectorLeaf())
+        << "Predict contribution" << MTNotImplemented();
     auto const n_threads = this->ctx_->Threads();
     const int num_feature = model.learner_model_param->num_feature;
     std::vector<RegTree::FVec> feat_vecs;
@@ -825,11 +886,12 @@ class CPUPredictor : public Predictor {
     }
   }
 
-  void PredictInteractionContributions(
-      DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
-      const gbm::GBTreeModel &model, unsigned ntree_limit,
-      std::vector<bst_float> const *tree_weights,
-      bool approximate) const override {
+  void PredictInteractionContributions(DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
+                                       const gbm::GBTreeModel &model, unsigned ntree_limit,
+                                       std::vector<bst_float> const *tree_weights,
+                                       bool approximate) const override {
+    CHECK(!model.learner_model_param->IsVectorLeaf())
+        << "Predict interaction contribution" << MTNotImplemented();
     const MetaInfo& info = p_fmat->Info();
     const int ngroup = model.learner_model_param->num_output_group;
     size_t const ncolumns = model.learner_model_param->num_feature;
@@ -884,5 +946,4 @@ class CPUPredictor : public Predictor {
 XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
     .describe("Make predictions using CPU.")
     .set_body([](Context const *ctx) { return new CPUPredictor(ctx); });
-}  // namespace predictor
-}  // namespace xgboost
+}  // namespace xgboost::predictor
diff --git a/src/predictor/predict_fn.h b/src/predictor/predict_fn.h
index 5d0c175fc..dbaf4a75e 100644
--- a/src/predictor/predict_fn.h
+++ b/src/predictor/predict_fn.h
@@ -1,13 +1,12 @@
-/*!
- * Copyright 2021 by XGBoost Contributors
+/**
+ * Copyright 2021-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_PREDICTOR_PREDICT_FN_H_
 #define XGBOOST_PREDICTOR_PREDICT_FN_H_
 #include "../common/categorical.h"
 #include "xgboost/tree_model.h"
 
-namespace xgboost {
-namespace predictor {
+namespace xgboost::predictor {
 template <bool has_missing, bool has_categorical>
 inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid,
                                              float fvalue, bool is_missing,
@@ -24,6 +23,25 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
     }
   }
 }
-}      // namespace predictor
-}      // namespace xgboost
+
+template <bool has_missing, bool has_categorical>
+inline XGBOOST_DEVICE bst_node_t GetNextNodeMulti(MultiTargetTree const &tree,
+                                                  bst_node_t const nidx, float fvalue,
+                                                  bool is_missing,
+                                                  RegTree::CategoricalSplitMatrix const &cats) {
+  if (has_missing && is_missing) {
+    return tree.DefaultChild(nidx);
+  } else {
+    if (has_categorical && common::IsCat(cats.split_type, nidx)) {
+      auto node_categories =
+          cats.categories.subspan(cats.node_ptr[nidx].beg, cats.node_ptr[nidx].size);
+      return common::Decision(node_categories, fvalue) ? tree.LeftChild(nidx)
+                                                       : tree.RightChild(nidx);
+    } else {
+      return tree.LeftChild(nidx) + !(fvalue < tree.SplitCond(nidx));
+    }
+  }
+}
+
+}  // namespace xgboost::predictor
 #endif  // XGBOOST_PREDICTOR_PREDICT_FN_H_
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index ebb56d2d3..9236f569f 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -224,19 +224,18 @@ std::string RandomDataGenerator::GenerateArrayInterface(
   return out;
 }
 
-std::pair<std::vector<std::string>, std::string>
-RandomDataGenerator::GenerateArrayInterfaceBatch(
-    HostDeviceVector<float> *storage, size_t batches) const {
-  this->GenerateDense(storage);
+std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
+    HostDeviceVector<float> const* storage, std::size_t n_samples, bst_feature_t n_features,
+    std::size_t batches, std::int32_t device) {
   std::vector<std::string> result(batches);
   std::vector<Json> objects;
 
-  size_t const rows_per_batch = rows_ / batches;
+  size_t const rows_per_batch = n_samples / batches;
 
-  auto make_interface = [storage, this](size_t offset, size_t rows) {
+  auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) {
     Json array_interface{Object()};
     array_interface["data"] = std::vector<Json>(2);
-    if (device_ >= 0) {
+    if (device >= 0) {
       array_interface["data"][0] =
           Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
       array_interface["stream"] = Null{};
@@ -249,22 +248,22 @@ RandomDataGenerator::GenerateArrayInterfaceBatch(
 
     array_interface["shape"] = std::vector<Json>(2);
     array_interface["shape"][0] = rows;
-    array_interface["shape"][1] = cols_;
+    array_interface["shape"][1] = n_features;
 
     array_interface["typestr"] = String("<f4");
     array_interface["version"] = 3;
     return array_interface;
   };
 
-  auto j_interface = make_interface(0, rows_);
+  auto j_interface = make_interface(0, n_samples);
   size_t offset = 0;
   for (size_t i = 0; i < batches - 1; ++i) {
     objects.emplace_back(make_interface(offset, rows_per_batch));
-    offset += rows_per_batch * cols_;
+    offset += rows_per_batch * n_features;
   }
 
-  size_t const remaining = rows_ - offset / cols_;
-  CHECK_LE(offset, rows_ * cols_);
+  size_t const remaining = n_samples - offset / n_features;
+  CHECK_LE(offset, n_samples * n_features);
   objects.emplace_back(make_interface(offset, remaining));
 
   for (size_t i = 0; i < batches; ++i) {
@@ -276,6 +275,12 @@ RandomDataGenerator::GenerateArrayInterfaceBatch(
   return {result, interface_str};
 }
 
+std::pair<std::vector<std::string>, std::string> RandomDataGenerator::GenerateArrayInterfaceBatch(
+    HostDeviceVector<float>* storage, size_t batches) const {
+  this->GenerateDense(storage);
+  return MakeArrayInterfaceBatch(storage, rows_, cols_, batches, device_);
+}
+
 std::string RandomDataGenerator::GenerateColumnarArrayInterface(
     std::vector<HostDeviceVector<float>> *data) const {
   CHECK(data);
@@ -400,11 +405,14 @@ int NumpyArrayIterForTest::Next() {
   return 1;
 }
 
-std::shared_ptr<DMatrix>
-GetDMatrixFromData(const std::vector<float> &x, int num_rows, int num_columns){
+std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float>& x, std::size_t num_rows,
+                                            bst_feature_t num_columns) {
   data::DenseAdapter adapter(x.data(), num_rows, num_columns);
-  return std::shared_ptr<DMatrix>(new data::SimpleDMatrix(
-      &adapter, std::numeric_limits<float>::quiet_NaN(), 1));
+  auto p_fmat = std::shared_ptr<DMatrix>(
+      new data::SimpleDMatrix(&adapter, std::numeric_limits<float>::quiet_NaN(), 1));
+  CHECK_EQ(p_fmat->Info().num_row_, num_rows);
+  CHECK_EQ(p_fmat->Info().num_col_, num_columns);
+  return p_fmat;
 }
 
 std::unique_ptr<DMatrix> CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features,
@@ -572,12 +580,23 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
   return gbm;
 }
 
-ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols,
-                                   size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} {
+ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches)
+    : rows_{rows}, cols_{cols}, n_batches_{batches} {
   XGProxyDMatrixCreate(&proxy_);
   rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
+  std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
+}
+
+ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector<float> const& data,
+                                   std::size_t n_samples, bst_feature_t n_features,
+                                   std::size_t n_batches)
+    : rows_{n_samples}, cols_{n_features}, n_batches_{n_batches} {
+  XGProxyDMatrixCreate(&proxy_);
+  this->data_.Resize(data.Size());
+  CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches);
+  this->data_.Copy(data);
   std::tie(batches_, interface_) =
-      rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
+      MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id);
 }
 
 ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index ec0abf32b..279e3f759 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -188,7 +188,7 @@ class SimpleRealUniformDistribution {
 };
 
 template <typename T>
-Json GetArrayInterface(HostDeviceVector<T> *storage, size_t rows, size_t cols) {
+Json GetArrayInterface(HostDeviceVector<T> const* storage, size_t rows, size_t cols) {
   Json array_interface{Object()};
   array_interface["data"] = std::vector<Json>(2);
   if (storage->DeviceCanRead()) {
@@ -318,8 +318,8 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
   return x;
 }
 
-std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
-                                            int num_rows, int num_columns);
+std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float>& x, std::size_t num_rows,
+                                            bst_feature_t num_columns);
 
 /**
  * \brief Create Sparse Page using data iterator.
@@ -394,7 +394,7 @@ typedef void *DMatrixHandle;  // NOLINT(*);
 class ArrayIterForTest {
  protected:
   HostDeviceVector<float> data_;
-  size_t iter_ {0};
+  size_t iter_{0};
   DMatrixHandle proxy_;
   std::unique_ptr<RandomDataGenerator> rng_;
 
@@ -418,6 +418,11 @@ class ArrayIterForTest {
   auto Proxy() -> decltype(proxy_) { return proxy_; }
 
   explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches);
+  /**
+   * \brief Create iterator with user provided data.
+   */
+  explicit ArrayIterForTest(Context const* ctx, HostDeviceVector<float> const& data,
+                            std::size_t n_samples, bst_feature_t n_features, std::size_t n_batches);
   virtual ~ArrayIterForTest();
 };
 
@@ -433,6 +438,10 @@ class NumpyArrayIterForTest : public ArrayIterForTest {
  public:
   explicit NumpyArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(),
                                  size_t batches = Batches());
+  explicit NumpyArrayIterForTest(Context const* ctx, HostDeviceVector<float> const& data,
+                                 std::size_t n_samples, bst_feature_t n_features,
+                                 std::size_t n_batches)
+      : ArrayIterForTest{ctx, data, n_samples, n_features, n_batches} {}
   int Next() override;
   ~NumpyArrayIterForTest() override = default;
 };
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 9a0ebee18..401d33c4d 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -305,4 +305,10 @@ TEST(CpuPredictor, Sparse) {
   TestSparsePrediction(0.2, "cpu_predictor");
   TestSparsePrediction(0.8, "cpu_predictor");
 }
+
+TEST(CpuPredictor, Multi) {
+  Context ctx;
+  ctx.nthread = 1;
+  TestVectorLeafPrediction(&ctx);
+}
 }  // namespace xgboost
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 3e8a94c75..4570a010d 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -1,28 +1,34 @@
-/*!
- * Copyright 2020-2021 by Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
  */
-
 #include "test_predictor.h"
 
 #include <gtest/gtest.h>
-#include <xgboost/context.h>
-#include <xgboost/data.h>
-#include <xgboost/host_device_vector.h>
-#include <xgboost/predictor.h>
+#include <xgboost/context.h>                      // for Context
+#include <xgboost/data.h>                         // for DMatrix, BatchIterator, BatchSet, MetaInfo
+#include <xgboost/host_device_vector.h>           // for HostDeviceVector
+#include <xgboost/predictor.h>                    // for PredictionCacheEntry, Predictor, Predic...
 
-#include "../../../src/common/bitfield.h"
-#include "../../../src/common/categorical.h"
-#include "../../../src/common/io.h"
-#include "../../../src/data/adapter.h"
-#include "../../../src/data/proxy_dmatrix.h"
-#include "../helpers.h"
+#include <algorithm>                              // for max
+#include <limits>                                 // for numeric_limits
+#include <unordered_map>                          // for unordered_map
+
+#include "../../../src/common/bitfield.h"         // for LBitField32
+#include "../../../src/data/iterative_dmatrix.h"  // for IterativeDMatrix
+#include "../../../src/data/proxy_dmatrix.h"      // for DMatrixProxy
+#include "../helpers.h"                           // for GetDMatrixFromData, RandomDataGenerator
+#include "xgboost/json.h"                         // for Json, Object, get, String
+#include "xgboost/linalg.h"                       // for MakeVec, Tensor, TensorView, Vector
+#include "xgboost/logging.h"                      // for CHECK
+#include "xgboost/span.h"                         // for operator!=, SpanIterator, Span
+#include "xgboost/tree_model.h"                   // for RegTree
 
 namespace xgboost {
 TEST(Predictor, PredictionCache) {
   size_t constexpr kRows = 16, kCols = 4;
 
   PredictionContainer container;
-  DMatrix* m;
+  DMatrix *m;
   // Add a cache that is immediately expired.
   auto add_cache = [&]() {
     auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
@@ -412,4 +418,101 @@ void TestSparsePrediction(float sparsity, std::string predictor) {
     }
   }
 }
+
+void TestVectorLeafPrediction(Context const *ctx) {
+  std::unique_ptr<Predictor> cpu_predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", ctx));
+
+  size_t constexpr kRows = 5;
+  size_t constexpr kCols = 5;
+
+  LearnerModelParam mparam{static_cast<bst_feature_t>(kCols),
+                           linalg::Vector<float>{{0.5}, {1}, Context::kCpuId}, 1, 3,
+                           MultiStrategy::kMonolithic};
+
+  std::vector<std::unique_ptr<RegTree>> trees;
+  trees.emplace_back(new RegTree{mparam.LeafLength(), mparam.num_feature});
+
+  std::vector<float> p_w(mparam.LeafLength(), 0.0f);
+  std::vector<float> l_w(mparam.LeafLength(), 1.0f);
+  std::vector<float> r_w(mparam.LeafLength(), 2.0f);
+
+  auto &tree = trees.front();
+  tree->ExpandNode(0, static_cast<bst_feature_t>(1), 2.0, true,
+                   linalg::MakeVec(p_w.data(), p_w.size()), linalg::MakeVec(l_w.data(), l_w.size()),
+                   linalg::MakeVec(r_w.data(), r_w.size()));
+  ASSERT_TRUE(tree->IsMultiTarget());
+  ASSERT_TRUE(mparam.IsVectorLeaf());
+
+  gbm::GBTreeModel model{&mparam, ctx};
+  model.CommitModel(std::move(trees), 0);
+
+  auto run_test = [&](float expected, HostDeviceVector<float> *p_data) {
+    {
+      auto p_fmat = GetDMatrixFromData(p_data->ConstHostVector(), kRows, kCols);
+      PredictionCacheEntry predt_cache;
+      cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model);
+      ASSERT_EQ(predt_cache.predictions.Size(), kRows * mparam.LeafLength());
+      cpu_predictor->PredictBatch(p_fmat.get(), &predt_cache, model, 0, 1);
+      auto const &h_predt = predt_cache.predictions.HostVector();
+      for (auto v : h_predt) {
+        ASSERT_EQ(v, expected);
+      }
+    }
+
+    {
+      // inplace
+      PredictionCacheEntry predt_cache;
+      auto p_fmat = GetDMatrixFromData(p_data->ConstHostVector(), kRows, kCols);
+      cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model);
+      auto arr = GetArrayInterface(p_data, kRows, kCols);
+      std::string str;
+      Json::Dump(arr, &str);
+      auto proxy = std::shared_ptr<DMatrix>(new data::DMatrixProxy{});
+      dynamic_cast<data::DMatrixProxy *>(proxy.get())->SetArrayData(str.data());
+      cpu_predictor->InplacePredict(proxy, model, std::numeric_limits<float>::quiet_NaN(),
+                                    &predt_cache, 0, 1);
+      auto const &h_predt = predt_cache.predictions.HostVector();
+      for (auto v : h_predt) {
+        ASSERT_EQ(v, expected);
+      }
+    }
+
+    {
+      // ghist
+      PredictionCacheEntry predt_cache;
+      auto &h_data = p_data->HostVector();
+      // give it at least two bins, otherwise the histogram cuts only have min and max values.
+      for (std::size_t i = 0; i < 5; ++i) {
+        h_data[i] = 1.0;
+      }
+      auto p_fmat = GetDMatrixFromData(p_data->ConstHostVector(), kRows, kCols);
+
+      cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model);
+
+      auto iter = NumpyArrayIterForTest{ctx, *p_data, kRows, static_cast<bst_feature_t>(kCols),
+                                        static_cast<std::size_t>(1)};
+      p_fmat =
+          std::make_shared<data::IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
+                                                   std::numeric_limits<float>::quiet_NaN(), 0, 256);
+
+      cpu_predictor->InitOutPredictions(p_fmat->Info(), &predt_cache.predictions, model);
+      cpu_predictor->PredictBatch(p_fmat.get(), &predt_cache, model, 0, 1);
+      auto const &h_predt = predt_cache.predictions.HostVector();
+      // the smallest v uses the min_value from histogram cuts, which leads to a left leaf
+      // during prediction.
+      for (std::size_t i = 5; i < h_predt.size(); ++i) {
+        ASSERT_EQ(h_predt[i], expected) << i;
+      }
+    }
+  };
+
+  // go to right
+  HostDeviceVector<float> data(kRows * kCols, model.trees.front()->SplitCond(RegTree::kRoot) + 1.0);
+  run_test(2.5, &data);
+
+  // go to left
+  data.HostVector().assign(data.Size(), model.trees.front()->SplitCond(RegTree::kRoot) - 1.0);
+  run_test(1.5, &data);
+}
 }  // namespace xgboost
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index 61b05b31b..56c1523a1 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -1,9 +1,16 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
 #ifndef XGBOOST_TEST_PREDICTOR_H_
 #define XGBOOST_TEST_PREDICTOR_H_
 
+#include <xgboost/context.h>  // for Context
 #include <xgboost/predictor.h>
-#include <string>
+
 #include <cstddef>
+#include <string>
+
+#include "../../../src/gbm/gbtree_model.h"  // for GBTreeModel
 #include "../helpers.h"
 
 namespace xgboost {
@@ -48,7 +55,7 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
     PredictionCacheEntry precise_out_predictions;
     predictor->InitOutPredictions(p_dmat->Info(), &precise_out_predictions.predictions, model);
     predictor->PredictBatch(p_dmat.get(), &precise_out_predictions, model, 0);
-    ASSERT_FALSE(p_dmat->PageExists<Page>());
+    CHECK(!p_dmat->PageExists<Page>());
   }
 }
 
@@ -69,6 +76,8 @@ void TestCategoricalPredictLeaf(StringView name);
 void TestIterationRange(std::string name);
 
 void TestSparsePrediction(float sparsity, std::string predictor);
+
+void TestVectorLeafPrediction(Context const* ctx);
 }  // namespace xgboost
 
 #endif  // XGBOOST_TEST_PREDICTOR_H_

From 910ce580c893dad10b6c041ddb7ec2372fad800f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 14 Mar 2023 22:09:36 +0800
Subject: [PATCH 09/32] Clear all cache after model load. (#8904)

---
 include/xgboost/cache.h           | 12 ++++++++++++
 src/learner.cc                    |  4 ++++
 tests/python/test_basic_models.py | 21 +++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/include/xgboost/cache.h b/include/xgboost/cache.h
index 781f45b1c..6195e730c 100644
--- a/include/xgboost/cache.h
+++ b/include/xgboost/cache.h
@@ -116,6 +116,18 @@ class DMatrixCache {
    * \param cache_size Maximum size of the cache.
    */
   explicit DMatrixCache(std::size_t cache_size) : max_size_{cache_size} {}
+
+  DMatrixCache& operator=(DMatrixCache&& that) {
+    CHECK(lock_.try_lock());
+    lock_.unlock();
+    CHECK(that.lock_.try_lock());
+    that.lock_.unlock();
+    std::swap(this->container_, that.container_);
+    std::swap(this->queue_, that.queue_);
+    std::swap(this->max_size_, that.max_size_);
+    return *this;
+  }
+
   /**
    * \brief Cache a new DMatrix if it's not in the cache already.
    *
diff --git a/src/learner.cc b/src/learner.cc
index 62875ead6..d91add70d 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -868,6 +868,8 @@ class LearnerIO : public LearnerConfiguration {
   // Will be removed once JSON takes over.  Right now we still loads some RDS files from R.
   std::string const serialisation_header_ { u8"CONFIG-offset:" };
 
+  void ClearCaches() { this->prediction_container_ = PredictionContainer{}; }
+
  public:
   explicit LearnerIO(std::vector<std::shared_ptr<DMatrix>> cache) : LearnerConfiguration{cache} {}
 
@@ -920,6 +922,7 @@ class LearnerIO : public LearnerConfiguration {
     }
 
     this->need_configuration_ = true;
+    this->ClearCaches();
   }
 
   void SaveModel(Json* p_out) const override {
@@ -1096,6 +1099,7 @@ class LearnerIO : public LearnerConfiguration {
     cfg_.insert(n.cbegin(), n.cend());
 
     this->need_configuration_ = true;
+    this->ClearCaches();
   }
 
   // Save model into binary format.  The code is about to be deprecated by more robust
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index 06f666da1..acacc55f8 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -234,6 +234,27 @@ class TestModels:
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'error'}, seed=0, show_stdv=False)
 
+    def test_prediction_cache(self) -> None:
+        X, y = tm.make_sparse_regression(512, 4, 0.5, as_dense=False)
+        Xy = xgb.DMatrix(X, y)
+        param = {"max_depth": 8}
+        booster = xgb.train(param, Xy, num_boost_round=1)
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, "model.json")
+            booster.save_model(path)
+
+            predt_0 = booster.predict(Xy)
+
+            param["max_depth"] = 2
+
+            booster = xgb.train(param, Xy, num_boost_round=1)
+            predt_1 = booster.predict(Xy)
+            assert not np.isclose(predt_0, predt_1).all()
+
+            booster.load_model(path)
+            predt_2 = booster.predict(Xy)
+            np.testing.assert_allclose(predt_0, predt_2)
+
     def test_feature_names_validation(self):
         X = np.random.random((10, 3))
         y = np.random.randint(2, size=(10,))

From 95e2baf7c278ef23c9cda21d22e3df5df4c1a2f1 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 15 Mar 2023 00:55:17 +0800
Subject: [PATCH 10/32] [doc] Fix typo [skip ci] (#8907)

---
 doc/tutorials/dask.rst | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 6608a8594..ba0da9089 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -575,27 +575,26 @@ allocation or have different amount of available resources during different
 sessions. There are heuristics and guidelines on how to achieve it but no proven method
 for guaranteeing such deterministic behavior. The Dask interface in XGBoost tries to
 provide reproducible result with best effort. This section highlights some known criteria
-and try share some insight into the issue.
+and tries to share some insights into the issue.
 
 There are primarily two different tasks for XGBoost the carry out, training and
 inference. Inference is reproducible given the same software and hardware along with the
-same run-time configurations like number of threads. The remaining of this section will
-focus on training.
+same run-time configurations. The remainder of this section will focus on training.
 
 Many of the challenges come from the fact that we are using approximation algorithms, The
 sketching algorithm used to find histogram bins is an approximation to the exact quantile
 algorithm, the `AUC` metric in a distributed environment is an approximation to the exact
-`AUC` score, and floating-point number if an approximation to real numbers. Floating point
+`AUC` score, and a floating-point number is an approximation to a real number. Floating-point
 is an issue as its summation is not associative, meaning :math:`(a + b) + c` does not
 necessarily equal to :math:`a + (b + c)`, even though this property holds true for real
-number. As a result, whenever we change the order of summation, the result can
+number. As a result, whenever we change the order of a summation, the result can
 differ. This imposes the requirement that, in order to have reproducible output from
 XGBoost, the entire pipeline needs to be reproducible.
 
 - The software stack is the same for each runs. This goes without saying. XGBoost might
   generate different outputs between different versions. This is expected as we might
   change the default value of hyper-parameter, or the parallel strategy that generates
-  different floating point result. We guarantee the correctness the algorithms, but there
+  different floating-point result. We guarantee the correctness of the algorithms, but there
   are lots of wiggle room for the final output. The situation is similar for many
   dependencies, for instance, the random number generator might differ from platform to
   platform.
@@ -622,10 +621,10 @@ XGBoost, the entire pipeline needs to be reproducible.
 
 - The operations performed on dataframes need to be reproducible. There are some
   operations like `DataFrame.merge` not being deterministic on parallel hardwares like GPU
-  where the order of the index of merge result might differ from run to run.
+  where the order of the index might differ from run to run.
 
-It's expected to have different results when training the model on distributed environment
-than training the model using a single node due to aforementioned criteria.
+It's expected to have different results when training the model in a distributed
+environment than training the model using a single node due to aforementioned criteria.
 
 
 ************

From 8685556af22c5a47f47d77d12c12c1d0ea445b22 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 15 Mar 2023 01:42:51 +0800
Subject: [PATCH 11/32] Implement hist evaluator for multi-target tree. (#8908)

---
 src/common/hist_util.h                        |  22 +-
 src/tree/common_row_partitioner.h             |  21 +-
 src/tree/hist/evaluate_splits.h               | 245 ++++++++++++++++--
 src/tree/param.h                              |  81 ++++--
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu |   2 +-
 tests/cpp/tree/hist/test_evaluate_splits.cc   |  96 ++++++-
 tests/cpp/tree/test_evaluate_splits.h         |  25 +-
 7 files changed, 416 insertions(+), 76 deletions(-)

diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index c09e5c71a..d95d405eb 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -7,23 +7,22 @@
 #ifndef XGBOOST_COMMON_HIST_UTIL_H_
 #define XGBOOST_COMMON_HIST_UTIL_H_
 
-#include <xgboost/data.h>
-
 #include <algorithm>
+#include <cstdint>  // for uint32_t
 #include <limits>
 #include <map>
 #include <memory>
 #include <utility>
 #include <vector>
 
-#include "algorithm.h"  // SegmentId
 #include "categorical.h"
 #include "common.h"
 #include "quantile.h"
 #include "row_set.h"
 #include "threading_utils.h"
 #include "timer.h"
-#include "xgboost/base.h"  // bst_feature_t, bst_bin_t
+#include "xgboost/base.h"  // for bst_feature_t, bst_bin_t
+#include "xgboost/data.h"
 
 namespace xgboost {
 class GHistIndexMatrix;
@@ -392,15 +391,18 @@ class HistCollection {
   }
 
   // have we computed a histogram for i-th node?
-  bool RowExists(bst_uint nid) const {
+  [[nodiscard]] bool RowExists(bst_uint nid) const {
     const uint32_t k_max = std::numeric_limits<uint32_t>::max();
     return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
   }
-
-  // initialize histogram collection
-  void Init(uint32_t nbins) {
-    if (nbins_ != nbins) {
-      nbins_ = nbins;
+  /**
+   * \brief Initialize histogram collection.
+   *
+   * \param n_total_bins Number of bins across all features.
+   */
+  void Init(std::uint32_t n_total_bins) {
+    if (nbins_ != n_total_bins) {
+      nbins_ = n_total_bins;
       // quite expensive operation, so let's do this only once
       data_.clear();
     }
diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h
index 3a46a168a..a58dbb452 100644
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -99,22 +99,25 @@ class CommonRowPartitioner {
 
   void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
                            const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions) {
-    for (size_t i = 0; i < nodes.size(); ++i) {
-      const int32_t nid = nodes[i].nid;
-      const bst_uint fid = tree[nid].SplitIndex();
-      const bst_float split_pt = tree[nid].SplitCond();
-      const uint32_t lower_bound = gmat.cut.Ptrs()[fid];
-      const uint32_t upper_bound = gmat.cut.Ptrs()[fid + 1];
+    auto const& ptrs = gmat.cut.Ptrs();
+    auto const& vals = gmat.cut.Values();
+
+    for (std::size_t i = 0; i < nodes.size(); ++i) {
+      bst_node_t const nid = nodes[i].nid;
+      bst_feature_t const fid = tree[nid].SplitIndex();
+      const float split_pt = tree[nid].SplitCond();
+      const uint32_t lower_bound = ptrs[fid];
+      const uint32_t upper_bound = ptrs[fid + 1];
       bst_bin_t split_cond = -1;
       // convert floating-point split_pt into corresponding bin_id
       // split_cond = -1 indicates that split_pt is less than all known cut points
       CHECK_LT(upper_bound, static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
       for (auto bound = lower_bound; bound < upper_bound; ++bound) {
-        if (split_pt == gmat.cut.Values()[bound]) {
-          split_cond = static_cast<int32_t>(bound);
+        if (split_pt == vals[bound]) {
+          split_cond = static_cast<bst_bin_t>(bound);
         }
       }
-      (*split_conditions).at(i) = split_cond;
+      (*split_conditions)[i] = split_cond;
     }
   }
 
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index 31a61fb9d..925a5fb76 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -4,22 +4,25 @@
 #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 
-#include <algorithm>
-#include <cstddef>  // for size_t
-#include <limits>
-#include <memory>
-#include <numeric>
-#include <utility>
-#include <vector>
+#include <algorithm>                   // for copy
+#include <cstddef>                     // for size_t
+#include <limits>                      // for numeric_limits
+#include <memory>                      // for shared_ptr
+#include <numeric>                     // for accumulate
+#include <utility>                     // for move
+#include <vector>                      // for vector
 
-#include "../../common/categorical.h"
-#include "../../common/hist_util.h"
-#include "../../common/random.h"
-#include "../../data/gradient_index.h"
-#include "../constraints.h"
-#include "../param.h"  // for TrainParam
-#include "../split_evaluator.h"
-#include "xgboost/context.h"
+#include "../../common/categorical.h"  // for CatBitField
+#include "../../common/hist_util.h"    // for GHistRow, HistogramCuts
+#include "../../common/linalg_op.h"    // for cbegin, cend, begin
+#include "../../common/random.h"       // for ColumnSampler
+#include "../constraints.h"            // for FeatureInteractionConstraintHost
+#include "../param.h"                  // for TrainParam
+#include "../split_evaluator.h"        // for TreeEvaluator
+#include "expand_entry.h"              // for MultiExpandEntry
+#include "xgboost/base.h"              // for bst_node_t, bst_target_t, bst_feature_t
+#include "xgboost/context.h"           // for Context
+#include "xgboost/linalg.h"            // for Constants, Vector
 
 namespace xgboost::tree {
 template <typename ExpandEntry>
@@ -410,8 +413,6 @@ class HistEvaluator {
                              tree[candidate.nid].SplitIndex(), left_weight,
                              right_weight);
 
-    auto max_node = std::max(left_child, tree[candidate.nid].RightChild());
-    max_node = std::max(candidate.nid, max_node);
     snode_.resize(tree.GetNodes().size());
     snode_.at(left_child).stats = candidate.split.left_sum;
     snode_.at(left_child).root_gain =
@@ -456,6 +457,216 @@ class HistEvaluator {
   }
 };
 
+class HistMultiEvaluator {
+  std::vector<double> gain_;
+  linalg::Matrix<GradientPairPrecise> stats_;
+  TrainParam const *param_;
+  FeatureInteractionConstraintHost interaction_constraints_;
+  std::shared_ptr<common::ColumnSampler> column_sampler_;
+  Context const *ctx_;
+
+ private:
+  static double MultiCalcSplitGain(TrainParam const &param,
+                                   linalg::VectorView<GradientPairPrecise const> left_sum,
+                                   linalg::VectorView<GradientPairPrecise const> right_sum,
+                                   linalg::VectorView<float> left_weight,
+                                   linalg::VectorView<float> right_weight) {
+    CalcWeight(param, left_sum, left_weight);
+    CalcWeight(param, right_sum, right_weight);
+
+    auto left_gain = CalcGainGivenWeight(param, left_sum, left_weight);
+    auto right_gain = CalcGainGivenWeight(param, right_sum, right_weight);
+    return left_gain + right_gain;
+  }
+
+  template <bst_bin_t d_step>
+  bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx,
+                      common::Span<common::GHistRow const> hist,
+                      linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain,
+                      SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const {
+    auto const &cut_ptr = cut.Ptrs();
+    auto const &cut_val = cut.Values();
+    auto const &min_val = cut.MinValues();
+
+    auto sum = linalg::Empty<GradientPairPrecise>(ctx_, 2, hist.size());
+    auto left_sum = sum.Slice(0, linalg::All());
+    auto right_sum = sum.Slice(1, linalg::All());
+
+    bst_bin_t ibegin, iend;
+    if (d_step > 0) {
+      ibegin = static_cast<bst_bin_t>(cut_ptr[fidx]);
+      iend = static_cast<bst_bin_t>(cut_ptr[fidx + 1]);
+    } else {
+      ibegin = static_cast<bst_bin_t>(cut_ptr[fidx + 1]) - 1;
+      iend = static_cast<bst_bin_t>(cut_ptr[fidx]) - 1;
+    }
+    const auto imin = static_cast<bst_bin_t>(cut_ptr[fidx]);
+
+    auto n_targets = hist.size();
+    auto weight = linalg::Empty<float>(ctx_, 2, n_targets);
+    auto left_weight = weight.Slice(0, linalg::All());
+    auto right_weight = weight.Slice(1, linalg::All());
+
+    for (bst_bin_t i = ibegin; i != iend; i += d_step) {
+      for (bst_target_t t = 0; t < n_targets; ++t) {
+        auto t_hist = hist[t];
+        auto t_p = parent_sum(t);
+        left_sum(t) += t_hist[i];
+        right_sum(t) = t_p - left_sum(t);
+      }
+
+      if (d_step > 0) {
+        auto split_pt = cut_val[i];
+        auto loss_chg =
+            MultiCalcSplitGain(*param_, right_sum, left_sum, right_weight, left_weight) -
+            parent_gain;
+        p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, left_sum, right_sum);
+      } else {
+        float split_pt;
+        if (i == imin) {
+          split_pt = min_val[fidx];
+        } else {
+          split_pt = cut_val[i - 1];
+        }
+        auto loss_chg =
+            MultiCalcSplitGain(*param_, right_sum, left_sum, left_weight, right_weight) -
+            parent_gain;
+        p_best->Update(loss_chg, fidx, split_pt, d_step == -1, false, right_sum, left_sum);
+      }
+    }
+    // Return true if there are missing values. Doesn't handle floating-point error well.
+    if (d_step == +1) {
+      return !std::equal(linalg::cbegin(left_sum), linalg::cend(left_sum),
+                         linalg::cbegin(parent_sum));
+    }
+    return false;
+  }
+
+ public:
+  void EvaluateSplits(RegTree const &tree, common::Span<const common::HistCollection *> hist,
+                      common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
+    auto &entries = *p_entries;
+    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
+
+    for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
+      auto nidx = entries[nidx_in_set].nid;
+      features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
+    }
+    CHECK(!features.empty());
+
+    std::int32_t n_threads = ctx_->Threads();
+    std::size_t const grain_size = std::max<std::size_t>(1, features.front()->Size() / n_threads);
+    common::BlockedSpace2d space(
+        entries.size(), [&](std::size_t nidx_in_set) { return features[nidx_in_set]->Size(); },
+        grain_size);
+
+    std::vector<MultiExpandEntry> tloc_candidates(n_threads * entries.size());
+    for (std::size_t i = 0; i < entries.size(); ++i) {
+      for (std::int32_t j = 0; j < n_threads; ++j) {
+        tloc_candidates[i * n_threads + j] = entries[i];
+      }
+    }
+    common::ParallelFor2d(space, n_threads, [&](std::size_t nidx_in_set, common::Range1d r) {
+      auto tidx = omp_get_thread_num();
+      auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
+      auto best = &entry->split;
+      auto parent_sum = stats_.Slice(entry->nid, linalg::All());
+      std::vector<common::GHistRow> node_hist;
+      for (auto t_hist : hist) {
+        node_hist.push_back((*t_hist)[entry->nid]);
+      }
+      auto features_set = features[nidx_in_set]->ConstHostSpan();
+
+      for (auto fidx_in_set = r.begin(); fidx_in_set < r.end(); fidx_in_set++) {
+        auto fidx = features_set[fidx_in_set];
+        if (!interaction_constraints_.Query(entry->nid, fidx)) {
+          continue;
+        }
+        auto parent_gain = gain_[entry->nid];
+        bool missing =
+            this->EnumerateSplit<+1>(cut, fidx, node_hist, parent_sum, parent_gain, best);
+        if (missing) {
+          this->EnumerateSplit<-1>(cut, fidx, node_hist, parent_sum, parent_gain, best);
+        }
+      }
+    });
+
+    for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
+      for (auto tidx = 0; tidx < n_threads; ++tidx) {
+        entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
+      }
+    }
+  }
+
+  linalg::Vector<float> InitRoot(linalg::VectorView<GradientPairPrecise const> root_sum) {
+    auto n_targets = root_sum.Size();
+    stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
+    gain_.resize(1);
+
+    linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
+    CalcWeight(*param_, root_sum, weight.HostView());
+    auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
+    gain_.front() = root_gain;
+
+    auto h_stats = stats_.HostView();
+    std::copy(linalg::cbegin(root_sum), linalg::cend(root_sum), linalg::begin(h_stats));
+
+    return weight;
+  }
+
+  void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
+    auto n_targets = p_tree->NumTargets();
+    auto parent_sum = stats_.Slice(candidate.nid, linalg::All());
+
+    auto weight = linalg::Empty<float>(ctx_, 3, n_targets);
+    auto base_weight = weight.Slice(0, linalg::All());
+    CalcWeight(*param_, parent_sum, base_weight);
+
+    auto left_weight = weight.Slice(1, linalg::All());
+    auto left_sum =
+        linalg::MakeVec(candidate.split.left_sum.data(), candidate.split.left_sum.size());
+    CalcWeight(*param_, left_sum, param_->learning_rate, left_weight);
+
+    auto right_weight = weight.Slice(2, linalg::All());
+    auto right_sum =
+        linalg::MakeVec(candidate.split.right_sum.data(), candidate.split.right_sum.size());
+    CalcWeight(*param_, right_sum, param_->learning_rate, right_weight);
+
+    p_tree->ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
+                       candidate.split.DefaultLeft(), base_weight, left_weight, right_weight);
+    CHECK(p_tree->IsMultiTarget());
+    auto left_child = p_tree->LeftChild(candidate.nid);
+    CHECK_GT(left_child, candidate.nid);
+    auto right_child = p_tree->RightChild(candidate.nid);
+    CHECK_GT(right_child, candidate.nid);
+
+    std::size_t n_nodes = p_tree->Size();
+    gain_.resize(n_nodes);
+    gain_[left_child] = CalcGainGivenWeight(*param_, left_sum, left_weight);
+    gain_[right_child] = CalcGainGivenWeight(*param_, right_sum, right_weight);
+
+    if (n_nodes >= stats_.Shape(0)) {
+      stats_.Reshape(n_nodes * 2, stats_.Shape(1));
+    }
+    CHECK_EQ(stats_.Shape(1), n_targets);
+    auto left_sum_stat = stats_.Slice(left_child, linalg::All());
+    std::copy(candidate.split.left_sum.cbegin(), candidate.split.left_sum.cend(),
+              linalg::begin(left_sum_stat));
+    auto right_sum_stat = stats_.Slice(right_child, linalg::All());
+    std::copy(candidate.split.right_sum.cbegin(), candidate.split.right_sum.cend(),
+              linalg::begin(right_sum_stat));
+  }
+
+  explicit HistMultiEvaluator(Context const *ctx, MetaInfo const &info, TrainParam const *param,
+                              std::shared_ptr<common::ColumnSampler> sampler)
+      : param_{param}, column_sampler_{std::move(sampler)}, ctx_{ctx} {
+    interaction_constraints_.Configure(*param, info.num_col_);
+    column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
+                          param_->colsample_bynode, param_->colsample_bylevel,
+                          param_->colsample_bytree);
+  }
+};
+
 /**
  * \brief CPU implementation of update prediction cache, which calculates the leaf value
  *        for the last tree and accumulates it to prediction vector.
diff --git a/src/tree/param.h b/src/tree/param.h
index 98895e5a2..0d59a5c35 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -14,10 +14,12 @@
 #include <string>
 #include <vector>
 
-#include "xgboost/parameter.h"
-#include "xgboost/data.h"
 #include "../common/categorical.h"
+#include "../common/linalg_op.h"
 #include "../common/math.h"
+#include "xgboost/data.h"
+#include "xgboost/linalg.h"
+#include "xgboost/parameter.h"
 
 namespace xgboost {
 namespace tree {
@@ -197,12 +199,11 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
   }
 
   /*! \brief given the loss change, whether we need to invoke pruning */
-  bool NeedPrune(double loss_chg, int depth) const {
-    return loss_chg < this->min_split_loss ||
-           (this->max_depth != 0 && depth > this->max_depth);
+  [[nodiscard]] bool NeedPrune(double loss_chg, int depth) const {
+    return loss_chg < this->min_split_loss || (this->max_depth != 0 && depth > this->max_depth);
   }
 
-  bst_node_t MaxNodes() const {
+  [[nodiscard]] bst_node_t MaxNodes() const {
     if (this->max_depth == 0 && this->max_leaves == 0) {
       LOG(FATAL) << "Max leaves and max depth cannot both be unconstrained.";
     }
@@ -292,6 +293,34 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad)
   return CalcWeight(p, sum_grad.GetGrad(), sum_grad.GetHess());
 }
 
+/**
+ * \brief multi-target weight, calculated with learning rate.
+ */
+inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
+                       float eta, linalg::VectorView<float> out_w) {
+  for (bst_target_t i = 0; i < out_w.Size(); ++i) {
+    out_w(i) = CalcWeight(p, grad_sum(i).GetGrad(), grad_sum(i).GetHess()) * eta;
+  }
+}
+
+/**
+ * \brief multi-target weight
+ */
+inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
+                       linalg::VectorView<float> out_w) {
+  return CalcWeight(p, grad_sum, 1.0f, out_w);
+}
+
+inline double CalcGainGivenWeight(TrainParam const &p,
+                                  linalg::VectorView<GradientPairPrecise const> sum_grad,
+                                  linalg::VectorView<float const> weight) {
+  double gain{0};
+  for (bst_target_t i = 0; i < weight.Size(); ++i) {
+    gain += -weight(i) * ThresholdL1(sum_grad(i).GetGrad(), p.reg_alpha);
+  }
+  return gain;
+}
+
 /*! \brief core statistics used for tree construction */
 struct XGBOOST_ALIGNAS(16) GradStats {
   using GradType = double;
@@ -301,8 +330,8 @@ struct XGBOOST_ALIGNAS(16) GradStats {
   GradType sum_hess { 0 };
 
  public:
-  XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; }
-  XGBOOST_DEVICE GradType GetHess() const { return sum_hess; }
+  [[nodiscard]] XGBOOST_DEVICE GradType GetGrad() const { return sum_grad; }
+  [[nodiscard]] XGBOOST_DEVICE GradType GetHess() const { return sum_hess; }
 
   friend std::ostream& operator<<(std::ostream& os, GradStats s) {
     os << s.GetGrad() << "/" << s.GetHess();
@@ -340,7 +369,7 @@ struct XGBOOST_ALIGNAS(16) GradStats {
     sum_hess = a.sum_hess - b.sum_hess;
   }
   /*! \return whether the statistics is not used yet */
-  inline bool Empty() const { return sum_hess == 0.0; }
+  [[nodiscard]] bool Empty() const { return sum_hess == 0.0; }
   /*! \brief add statistics to the data */
   inline void Add(GradType grad, GradType hess) {
     sum_grad += grad;
@@ -348,6 +377,19 @@ struct XGBOOST_ALIGNAS(16) GradStats {
   }
 };
 
+// Helper functions for copying gradient statistic, one for vector leaf, another for normal scalar.
+template <typename T, typename U>
+std::vector<T> &CopyStats(linalg::VectorView<U> const &src, std::vector<T> *dst) {  // NOLINT
+  dst->resize(src.Size());
+  std::copy(linalg::cbegin(src), linalg::cend(src), dst->begin());
+  return *dst;
+}
+
+inline GradStats &CopyStats(GradStats const &src, GradStats *dst) {  // NOLINT
+  *dst = src;
+  return *dst;
+}
+
 /*!
  * \brief statistics that is helpful to store
  *   and represent a split solution for the tree
@@ -378,9 +420,9 @@ struct SplitEntryContainer {
     return os;
   }
   /*!\return feature index to split on */
-  bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
+  [[nodiscard]] bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
   /*!\return whether missing value goes to left branch */
-  bool DefaultLeft() const { return (sindex >> 31) != 0; }
+  [[nodiscard]] bool DefaultLeft() const { return (sindex >> 31) != 0; }
   /*!
    * \brief decides whether we can replace current entry with the given statistics
    *
@@ -391,10 +433,10 @@ struct SplitEntryContainer {
    * \param new_loss_chg the loss reduction get through the split
    * \param split_index the feature index where the split is on
    */
-  bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
+  [[nodiscard]] bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
     if (std::isinf(new_loss_chg)) {  // in some cases new_loss_chg can be NaN or Inf,
-                                         // for example when lambda = 0 & min_child_weight = 0
-                                         // skip value in this case
+                                     // for example when lambda = 0 & min_child_weight = 0
+                                     // skip value in this case
       return false;
     } else if (this->SplitIndex() <= split_index) {
       return new_loss_chg > this->loss_chg;
@@ -429,9 +471,10 @@ struct SplitEntryContainer {
    * \param default_left whether the missing value goes to left
    * \return whether the proposed split is better and can replace current split
    */
-  bool Update(bst_float new_loss_chg, unsigned split_index,
-              bst_float new_split_value, bool default_left, bool is_cat,
-              const GradientT &left_sum, const GradientT &right_sum) {
+  template <typename GradientSumT>
+  bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value,
+              bool default_left, bool is_cat, GradientSumT const &left_sum,
+              GradientSumT const &right_sum) {
     if (this->NeedReplace(new_loss_chg, split_index)) {
       this->loss_chg = new_loss_chg;
       if (default_left) {
@@ -440,8 +483,8 @@ struct SplitEntryContainer {
       this->sindex = split_index;
       this->split_value = new_split_value;
       this->is_cat = is_cat;
-      this->left_sum = left_sum;
-      this->right_sum = right_sum;
+      CopyStats(left_sum, &this->left_sum);
+      CopyStats(right_sum, &this->right_sum);
       return true;
     } else {
       return false;
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 4582f546a..f1317fc02 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -304,7 +304,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
 
   // Setup gradients so that second feature gets higher gain
-  auto feature_histogram = ConvertToInteger({          {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
 
   dh::device_vector<FeatureType> feature_types(feature_set.size(),
                                                FeatureType::kCategorical);
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index cf9d78f52..dcd04f68a 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -1,18 +1,27 @@
 /**
  * Copyright 2021-2023 by XGBoost Contributors
  */
-#include <gtest/gtest.h>
-#include <xgboost/base.h>
-
-#include "../../../../src/common/hist_util.h"
-#include "../../../../src/tree/common_row_partitioner.h"
-#include "../../../../src/tree/hist/evaluate_splits.h"
 #include "../test_evaluate_splits.h"
-#include "../../helpers.h"
-#include "xgboost/context.h"  // Context
 
-namespace xgboost {
-namespace tree {
+#include <gtest/gtest.h>
+#include <xgboost/base.h>                               // for GradientPairPrecise, Args, Gradie...
+#include <xgboost/context.h>                            // for Context
+#include <xgboost/data.h>                               // for FeatureType, DMatrix, MetaInfo
+#include <xgboost/logging.h>                            // for CHECK_EQ
+#include <xgboost/tree_model.h>                         // for RegTree, RTreeNodeStat
+
+#include <memory>                                       // for make_shared, shared_ptr, addressof
+
+#include "../../../../src/common/hist_util.h"           // for HistCollection, HistogramCuts
+#include "../../../../src/common/random.h"              // for ColumnSampler
+#include "../../../../src/common/row_set.h"             // for RowSetCollection
+#include "../../../../src/data/gradient_index.h"        // for GHistIndexMatrix
+#include "../../../../src/tree/hist/evaluate_splits.h"  // for HistEvaluator
+#include "../../../../src/tree/hist/expand_entry.h"     // for CPUExpandEntry
+#include "../../../../src/tree/param.h"                 // for GradStats, TrainParam
+#include "../../helpers.h"                              // for RandomDataGenerator, AllThreadsFo...
+
+namespace xgboost::tree {
 void TestEvaluateSplits(bool force_read_by_column) {
   Context ctx;
   ctx.nthread = 4;
@@ -87,6 +96,68 @@ TEST(HistEvaluator, Evaluate) {
   TestEvaluateSplits(true);
 }
 
+TEST(HistMultiEvaluator, Evaluate) {
+  Context ctx;
+  ctx.nthread = 1;
+
+  TrainParam param;
+  param.Init(Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}});
+  auto sampler = std::make_shared<common::ColumnSampler>();
+
+  std::size_t n_samples = 3;
+  bst_feature_t n_features = 2;
+  bst_target_t n_targets = 2;
+  bst_bin_t n_bins = 2;
+
+  auto p_fmat =
+      RandomDataGenerator{n_samples, n_features, 0.5}.Targets(n_targets).GenerateDMatrix(true);
+
+  HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), &param, sampler};
+  std::vector<common::HistCollection> histogram(n_targets);
+  linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
+  for (bst_target_t t{0}; t < n_targets; ++t) {
+    auto &hist = histogram[t];
+    hist.Init(n_bins * n_features);
+    hist.AddHistRow(0);
+    hist.AllocateAllData();
+    auto node_hist = hist[0];
+    node_hist[0] = {-0.5, 0.5};
+    node_hist[1] = {2.0, 0.5};
+    node_hist[2] = {0.5, 0.5};
+    node_hist[3] = {1.0, 0.5};
+
+    root_sum(t) += node_hist[0];
+    root_sum(t) += node_hist[1];
+  }
+
+  RegTree tree{n_targets, n_features};
+  auto weight = evaluator.InitRoot(root_sum.HostView());
+  tree.SetLeaf(RegTree::kRoot, weight.HostView());
+  auto w = weight.HostView();
+  ASSERT_EQ(w.Size(), n_targets);
+  ASSERT_EQ(w(0), -1.5);
+  ASSERT_EQ(w(1), -1.5);
+
+  common::HistogramCuts cuts;
+  cuts.cut_ptrs_ = {0, 2, 4};
+  cuts.cut_values_ = {0.5, 1.0, 2.0, 3.0};
+  cuts.min_vals_ = {-0.2, 1.8};
+
+  std::vector<MultiExpandEntry> entries(1, {/*nidx=*/0, /*depth=*/0});
+
+  std::vector<common::HistCollection const *> ptrs;
+  std::transform(histogram.cbegin(), histogram.cend(), std::back_inserter(ptrs),
+                 [](auto const &h) { return std::addressof(h); });
+
+  evaluator.EvaluateSplits(tree, ptrs, cuts, &entries);
+
+  ASSERT_EQ(entries.front().split.loss_chg, 12.5);
+  ASSERT_EQ(entries.front().split.split_value, 0.5);
+  ASSERT_EQ(entries.front().split.SplitIndex(), 0);
+
+  ASSERT_EQ(sampler->GetFeatureSet(0)->Size(), n_features);
+}
+
 TEST(HistEvaluator, Apply) {
   Context ctx;
   ctx.nthread = 4;
@@ -211,12 +282,11 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
   std::vector<CPUExpandEntry> entries(1);
   RegTree tree;
   evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries);
-  auto const& split = entries.front().split;
+  auto const &split = entries.front().split;
 
   this->CheckResult(split.loss_chg, split.SplitIndex(), split.split_value, split.is_cat,
                     split.DefaultLeft(),
                     GradientPairPrecise{split.left_sum.GetGrad(), split.left_sum.GetHess()},
                     GradientPairPrecise{split.right_sum.GetGrad(), split.right_sum.GetHess()});
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h
index a74739faa..a7e8972e5 100644
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -2,15 +2,26 @@
  * Copyright 2022-2023 by XGBoost Contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>
+#include <xgboost/base.h>                       // for GradientPairInternal, GradientPairPrecise
+#include <xgboost/data.h>                       // for MetaInfo
+#include <xgboost/host_device_vector.h>         // for HostDeviceVector
+#include <xgboost/span.h>                       // for operator!=, Span, SpanIterator
 
-#include <algorithm>  // next_permutation
-#include <numeric>    // iota
+#include <algorithm>                            // for max, max_element, next_permutation, copy
+#include <cmath>                                // for isnan
+#include <cstddef>                              // for size_t
+#include <cstdint>                              // for int32_t, uint64_t, uint32_t
+#include <limits>                               // for numeric_limits
+#include <numeric>                              // for iota
+#include <tuple>                                // for make_tuple, tie, tuple
+#include <utility>                              // for pair
+#include <vector>                               // for vector
 
-#include "../../../src/common/hist_util.h"  // HistogramCuts,HistCollection
-#include "../../../src/tree/param.h"        // TrainParam
-#include "../../../src/tree/split_evaluator.h"
-#include "../helpers.h"
+#include "../../../src/common/hist_util.h"      // for HistogramCuts, HistCollection, GHistRow
+#include "../../../src/tree/param.h"            // for TrainParam, GradStats
+#include "../../../src/tree/split_evaluator.h"  // for TreeEvaluator
+#include "../helpers.h"                         // for SimpleLCG, SimpleRealUniformDistribution
+#include "gtest/gtest_pred_impl.h"              // for AssertionResult, ASSERT_EQ, ASSERT_TRUE
 
 namespace xgboost::tree {
 /**

From 72e8331eabb0b93a0859e44d827d31a168a4ec9d Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 15 Mar 2023 03:26:17 +0800
Subject: [PATCH 12/32] Reimplement the NDCG metric. (#8906)

- Add support for non-exp gain.
- Cache the DMatrix object to avoid re-calculating the IDCG.
- Make GPU implementation deterministic. (no atomic add)
---
 include/xgboost/cache.h              |  20 +++
 src/metric/rank_metric.cc            | 238 +++++++++++++++++++++------
 src/metric/rank_metric.cu            | 152 +++++++----------
 src/metric/rank_metric.h             |  33 ++++
 tests/cpp/metric/test_rank_metric.cc |  80 +++++++--
 5 files changed, 363 insertions(+), 160 deletions(-)
 create mode 100644 src/metric/rank_metric.h

diff --git a/include/xgboost/cache.h b/include/xgboost/cache.h
index 6195e730c..32e1b21ac 100644
--- a/include/xgboost/cache.h
+++ b/include/xgboost/cache.h
@@ -161,6 +161,26 @@ class DMatrixCache {
     }
     return container_.at(key).value;
   }
+  /**
+   * \brief Re-initialize the item in cache.
+   *
+   *   Since the shared_ptr is used to hold the item, any reference that lives outside of
+   *   the cache can no-longer be reached from the cache.
+   *
+   *   We use reset instead of erase to avoid walking through the whole cache for renewing
+   *   a single item. (the cache is FIFO, needs to maintain the order).
+   */
+  template <typename... Args>
+  std::shared_ptr<CacheT> ResetItem(std::shared_ptr<DMatrix> m, Args const&... args) {
+    std::lock_guard<std::mutex> guard{lock_};
+    CheckConsistent();
+    auto key = Key{m.get(), std::this_thread::get_id()};
+    auto it = container_.find(key);
+    CHECK(it != container_.cend());
+    it->second = {m, std::make_shared<CacheT>(args...)};
+    CheckConsistent();
+    return it->second.value;
+  }
   /**
    * \brief Get a const reference to the underlying hash map.  Clear expired caches before
    *        returning.
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 69e6e24cd..c2aa48cab 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -20,23 +20,51 @@
 //   corresponding headers that brings in those function declaration can't be included with CUDA).
 //   This precludes the CPU and GPU logic to coexist inside a .cu file
 
+#include "rank_metric.h"
+
+#include <dmlc/omp.h>
 #include <dmlc/registry.h>
-#include <xgboost/metric.h>
 
-#include <cmath>
-#include <vector>
+#include <algorithm>                         // for stable_sort, copy, fill_n, min, max
+#include <array>                             // for array
+#include <cmath>                             // for log, sqrt
+#include <cstddef>                           // for size_t, std
+#include <cstdint>                           // for uint32_t
+#include <functional>                        // for less, greater
+#include <map>                               // for operator!=, _Rb_tree_const_iterator
+#include <memory>                            // for allocator, unique_ptr, shared_ptr, __shared_...
+#include <numeric>                           // for accumulate
+#include <ostream>                           // for operator<<, basic_ostream, ostringstream
+#include <string>                            // for char_traits, operator<, basic_string, to_string
+#include <utility>                           // for pair, make_pair
+#include <vector>                            // for vector
 
-#include "../collective/communicator-inl.h"
-#include "../common/algorithm.h"  // Sort
-#include "../common/math.h"
-#include "../common/ranking_utils.h"  // MakeMetricName
-#include "../common/threading_utils.h"
-#include "metric_common.h"
-#include "xgboost/host_device_vector.h"
+#include "../collective/communicator-inl.h"  // for IsDistributed, Allreduce
+#include "../collective/communicator.h"      // for Operation
+#include "../common/algorithm.h"             // for ArgSort, Sort
+#include "../common/linalg_op.h"             // for cbegin, cend
+#include "../common/math.h"                  // for CmpFirst
+#include "../common/optional_weight.h"       // for OptionalWeights, MakeOptionalWeights
+#include "../common/ranking_utils.h"         // for LambdaRankParam, NDCGCache, ParseMetricName
+#include "../common/threading_utils.h"       // for ParallelFor
+#include "../common/transform_iterator.h"    // for IndexTransformIter
+#include "dmlc/common.h"                     // for OMPException
+#include "metric_common.h"                   // for MetricNoCache, GPUMetric, PackedReduceResult
+#include "xgboost/base.h"                    // for bst_float, bst_omp_uint, bst_group_t, Args
+#include "xgboost/cache.h"                   // for DMatrixCache
+#include "xgboost/context.h"                 // for Context
+#include "xgboost/data.h"                    // for MetaInfo, DMatrix
+#include "xgboost/host_device_vector.h"      // for HostDeviceVector
+#include "xgboost/json.h"                    // for Json, FromJson, IsA, ToJson, get, Null, Object
+#include "xgboost/linalg.h"                  // for Tensor, TensorView, Range, VectorView, MakeT...
+#include "xgboost/logging.h"                 // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ
+#include "xgboost/metric.h"                  // for MetricReg, XGBOOST_REGISTER_METRIC, Metric
+#include "xgboost/span.h"                    // for Span, operator!=
+#include "xgboost/string_view.h"             // for StringView
 
 namespace {
 
-using PredIndPair = std::pair<xgboost::bst_float, uint32_t>;
+using PredIndPair = std::pair<xgboost::bst_float, xgboost::ltr::rel_degree_t>;
 using PredIndPairContainer = std::vector<PredIndPair>;
 
 /*
@@ -87,8 +115,7 @@ class PerGroupWeightPolicy {
 
 }  // anonymous namespace
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(rank_metric);
 
@@ -257,40 +284,6 @@ struct EvalPrecision : public EvalRank {
   }
 };
 
-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCG : public EvalRank {
- private:
-  double CalcDCG(const PredIndPairContainer &rec) const {
-    double sumdcg = 0.0;
-    for (size_t i = 0; i < rec.size() && i < this->topn; ++i) {
-      const unsigned rel = rec[i].second;
-      if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / std::log2(i + 2.0);
-      }
-    }
-    return sumdcg;
-  }
-
- public:
-  explicit EvalNDCG(const char* name, const char* param) : EvalRank(name, param) {}
-
-  double EvalGroup(PredIndPairContainer *recptr) const override {
-    PredIndPairContainer &rec(*recptr);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-    double dcg = CalcDCG(rec);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpSecond);
-    double idcg = CalcDCG(rec);
-    if (idcg == 0.0f) {
-      if (this->minus) {
-        return 0.0f;
-      } else {
-        return 1.0f;
-      }
-    }
-    return dcg/idcg;
-  }
-};
-
 /*! \brief Mean Average Precision at N, for both classification and rank */
 struct EvalMAP : public EvalRank {
  public:
@@ -377,10 +370,6 @@ XGBOOST_REGISTER_METRIC(Precision, "pre")
 .describe("precision@k for rank.")
 .set_body([](const char* param) { return new EvalPrecision("pre", param); });
 
-XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
-.describe("ndcg@k for rank.")
-.set_body([](const char* param) { return new EvalNDCG("ndcg", param); });
-
 XGBOOST_REGISTER_METRIC(MAP, "map")
 .describe("map@k for rank.")
 .set_body([](const char* param) { return new EvalMAP("map", param); });
@@ -388,5 +377,148 @@ XGBOOST_REGISTER_METRIC(MAP, "map")
 XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
 .describe("Negative log partial likelihood of Cox proportional hazards model.")
 .set_body([](const char*) { return new EvalCox(); });
-}  // namespace metric
-}  // namespace xgboost
+
+// Base class for ranking metrics that require a cache (held in a DMatrixCache).
+template <typename Cache>
+class EvalRankWithCache : public Metric {
+ protected:
+  ltr::LambdaRankParam param_;
+  bool minus_{false};  // filled in by ParseMetricName in the constructor
+  std::string name_;   // metric name as returned by ParseMetricName
+
+  DMatrixCache<Cache> cache_{DMatrixCache<Cache>::DefaultSize()};
+
+ public:
+  EvalRankWithCache(StringView name, const char* param) {
+    auto constexpr kMax = ltr::LambdaRankParam::NotSet();
+    std::uint32_t topn{kMax};  // stays kMax unless ParseMetricName overwrites it
+    this->name_ = ltr::ParseMetricName(name, param, &topn, &minus_);
+    if (topn != kMax) {
+      param_.UpdateAllowUnknown(Args{{"lambdarank_num_pair_per_sample", std::to_string(topn)},
+                                     {"lambdarank_pair_method", "topk"}});
+    }
+    param_.UpdateAllowUnknown(Args{});
+  }
+  void Configure(Args const&) override {
+    // Intentionally a no-op: configuring here would force the ndcg parameters to be the same
+    // as the ones used by the objective.
+  }
+  void LoadConfig(Json const& in) override {
+    if (IsA<Null>(in)) {
+      return;
+    }
+    auto const& obj = get<Object const>(in);
+    auto it = obj.find("lambdarank_param");
+    if (it != obj.cend()) {
+      FromJson(it->second, &param_);
+    }
+  }
+
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String{this->Name()};
+    out["lambdarank_param"] = ToJson(param_);
+  }
+
+  double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
+    auto const& info = p_fmat->Info();
+    auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
+    if (p_cache->Param() != param_) {  // cached item does not match the current parameters
+      p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
+    }
+    CHECK(p_cache->Param() == param_);
+    CHECK_EQ(preds.Size(), info.labels.Size());
+
+    return this->Eval(preds, info, p_cache);
+  }
+
+  // Metric-specific scoring, called by Evaluate with the cache item matching param_.
+  virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
+                      std::shared_ptr<Cache> p_cache) = 0;
+};
+
+namespace {
+double Finalize(double score, double sw) {
+  // Sum the local score and the local weight over all workers, then normalise the score.
+  std::array<double, 2> dat{score, sw};
+  collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
+  // Use the reduced values: `score`/`sw` only cover the groups held by this worker.
+  score = dat[1] > 0.0 ? dat[0] / dat[1] : dat[0];
+
+  CHECK_LE(score, 1.0 + kRtEps)
+      << "Invalid output score, might be caused by invalid query group weight.";
+  score = std::min(1.0, score);  // clamp values that exceed 1 by at most kRtEps
+
+  return score;
+}
+}  // namespace
+
+/**
+ * \brief Implement the NDCG score function for learning to rank.
+ *
+ *     Ties are ignored, which can lead to results that differ from other implementations.
+ */
+class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
+ public:
+  using EvalRankWithCache::EvalRankWithCache;
+  const char* Name() const override { return name_.c_str(); }
+
+  double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
+              std::shared_ptr<ltr::NDCGCache> p_cache) override {
+    if (ctx_->IsCUDA()) {
+      auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
+      return Finalize(ndcg.Residue(), ndcg.Weights());
+    }
+
+    // per-group NDCG, written into the DCG buffer returned by the cache
+    auto group_ptr = p_cache->DataGroupPtr(ctx_);
+    bst_group_t n_groups = group_ptr.size() - 1;
+    auto ndcg_gloc = p_cache->Dcg(ctx_);
+    std::fill_n(ndcg_gloc.Values().data(), ndcg_gloc.Size(), 0.0);
+
+    auto h_inv_idcg = p_cache->InvIDCG(ctx_);
+    auto p_discount = p_cache->Discount(ctx_).data();
+
+    auto h_label = info.labels.HostView();
+    auto h_predt = linalg::MakeTensorView(ctx_, &preds, preds.Size());
+    auto weights = common::MakeOptionalWeights(ctx_, info.weights_);
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto g_predt = h_predt.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+      auto g_labels = h_label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]), 0);
+      auto sorted_idx = common::ArgSort<std::size_t>(ctx_, linalg::cbegin(g_predt),
+                                                     linalg::cend(g_predt), std::greater<>{});
+      double ndcg{.0};
+      double inv_idcg = h_inv_idcg(g);
+      if (inv_idcg <= 0.0) {  // non-positive inverse IDCG: use the fixed score chosen by minus_
+        ndcg_gloc(g) = minus_ ? 0.0 : 1.0;  // NOTE(review): not scaled by weights[g] - confirm
+        return;
+      }
+      std::size_t n{std::min(sorted_idx.size(), static_cast<std::size_t>(param_.TopK()))};
+      if (param_.ndcg_exp_gain) {
+        for (std::size_t i = 0; i < n; ++i) {
+          ndcg += p_discount[i] * ltr::CalcDCGGain(g_labels(sorted_idx[i])) * inv_idcg;
+        }
+      } else {
+        for (std::size_t i = 0; i < n; ++i) {
+          ndcg += p_discount[i] * g_labels(sorted_idx[i]) * inv_idcg;
+        }
+      }
+      ndcg_gloc(g) += ndcg * weights[g];
+    });
+    double sum_w{0};
+    if (weights.Empty()) {  // no weights given: every group counts as 1
+      sum_w = n_groups;
+    } else {
+      sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
+    }
+    auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
+    return Finalize(ndcg, sum_w);
+  }
+};
+
+XGBOOST_REGISTER_METRIC(EvalNDCG, "ndcg")
+    .describe("ndcg@k for ranking.")
+    .set_body([](char const* param) {
+      return new EvalNDCG{"ndcg", param};
+    });
+}  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index 5f98db7a9..4ab422a96 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -2,22 +2,29 @@
  * Copyright 2020-2023 by XGBoost Contributors
  */
 #include <dmlc/registry.h>
-#include <thrust/iterator/counting_iterator.h>  // make_counting_iterator
-#include <thrust/reduce.h>                      // reduce
-#include <xgboost/metric.h>
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/reduce.h>                      // for reduce
 
-#include <cstddef>                       // std::size_t
-#include <memory>                        // std::shared_ptr
+#include <algorithm>                            // for transform
+#include <cstddef>                              // for size_t
+#include <memory>                               // for shared_ptr
+#include <vector>                               // for vector
 
-#include "../common/cuda_context.cuh"    // CUDAContext
+#include "../common/cuda_context.cuh"           // for CUDAContext
+#include "../common/device_helpers.cuh"         // for MakeTransformIterator
+#include "../common/optional_weight.h"          // for MakeOptionalWeights
+#include "../common/ranking_utils.cuh"          // for CalcQueriesDCG, NDCGCache
 #include "metric_common.h"
-#include "xgboost/base.h"                // XGBOOST_DEVICE
-#include "xgboost/context.h"             // Context
-#include "xgboost/data.h"                // MetaInfo
-#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "rank_metric.h"
+#include "xgboost/base.h"                // for XGBOOST_DEVICE
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/linalg.h"              // for MakeTensorView
+#include "xgboost/logging.h"             // for CHECK
+#include "xgboost/metric.h"
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(rank_metric_gpu);
 
@@ -117,81 +124,6 @@ struct EvalPrecisionGpu {
   }
 };
 
-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCGGpu {
- public:
-  static void ComputeDCG(const dh::SegmentSorter<float> &pred_sorter,
-                         const float *dlabels,
-                         const EvalRankConfig &ecfg,
-                         // The order in which labels have to be accessed. The order is determined
-                         // by sorting the predictions or the labels for the entire dataset
-                         const xgboost::common::Span<const uint32_t> &dlabels_sort_order,
-                         dh::caching_device_vector<double> *dcgptr) {
-    dh::caching_device_vector<double> &dcgs(*dcgptr);
-    // Group info on device
-    const auto &dgroups = pred_sorter.GetGroupsSpan();
-    const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
-
-    // First, determine non zero labels in the dataset individually
-    auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
-      return (static_cast<unsigned>(dlabels[dlabels_sort_order[idx]]));
-    };  // NOLINT
-
-    // Find each group's DCG value
-    const auto nitems = pred_sorter.GetNumItems();
-    auto *ddcgs = dcgs.data().get();
-
-    int device_id = -1;
-    dh::safe_cuda(cudaGetDevice(&device_id));
-
-    // For each group item compute the aggregated precision
-    dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
-      const auto group_idx = dgroup_idx[idx];
-      const auto group_begin = dgroups[group_idx];
-      const auto ridx = idx - group_begin;
-      auto label = DetermineNonTrivialLabelLambda(idx);
-      if (ridx < ecfg.topn && label) {
-        atomicAdd(&ddcgs[group_idx], ((1 << label) - 1) / std::log2(ridx + 2.0));
-      }
-    });
-  }
-
-  static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
-                           const float *dlabels,
-                           const EvalRankConfig &ecfg) {
-    // Sort the labels and compute IDCG
-    dh::SegmentSorter<float> segment_label_sorter;
-    segment_label_sorter.SortItems(dlabels, pred_sorter.GetNumItems(),
-                                   pred_sorter.GetGroupSegmentsSpan());
-
-    uint32_t ngroups = pred_sorter.GetNumGroups();
-
-    dh::caching_device_vector<double> idcg(ngroups, 0);
-    ComputeDCG(pred_sorter, dlabels, ecfg, segment_label_sorter.GetOriginalPositionsSpan(), &idcg);
-
-    // Compute the DCG values next
-    dh::caching_device_vector<double> dcg(ngroups, 0);
-    ComputeDCG(pred_sorter, dlabels, ecfg, pred_sorter.GetOriginalPositionsSpan(), &dcg);
-
-    double *ddcg = dcg.data().get();
-    double *didcg = idcg.data().get();
-
-    int device_id = -1;
-    dh::safe_cuda(cudaGetDevice(&device_id));
-    // Compute the group's DCG and reduce it across all groups
-    dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
-      if (didcg[gidx] == 0.0f) {
-        ddcg[gidx] = (ecfg.minus) ? 0.0f : 1.0f;
-      } else {
-        ddcg[gidx] /= didcg[gidx];
-      }
-    });
-
-    // Allocator to be used for managing space overhead while performing reductions
-    dh::XGBCachingDeviceAllocator<char> alloc;
-    return thrust::reduce(thrust::cuda::par(alloc), dcg.begin(), dcg.end());
-  }
-};
 
 /*! \brief Mean Average Precision at N, for both classification and rank */
 struct EvalMAPGpu {
@@ -272,12 +204,46 @@ XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre")
 .describe("precision@k for rank computed on GPU.")
 .set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); });
 
-XGBOOST_REGISTER_GPU_METRIC(NDCGGpu, "ndcg")
-.describe("ndcg@k for rank computed on GPU.")
-.set_body([](const char* param) { return new EvalRankGpu<EvalNDCGGpu>("ndcg", param); });
-
 XGBOOST_REGISTER_GPU_METRIC(MAPGpu, "map")
 .describe("map@k for rank computed on GPU.")
 .set_body([](const char* param) { return new EvalRankGpu<EvalMAPGpu>("map", param); });
-}  // namespace metric
-}  // namespace xgboost
+
+namespace cuda_impl {
+PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
+                             HostDeviceVector<float> const &predt, bool minus,
+                             std::shared_ptr<ltr::NDCGCache> p_cache) {
+  CHECK(p_cache);
+  // Computes the per-group NDCG on the device and reduces it to (score sum, weight sum).
+  auto const &p = p_cache->Param();
+  auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
+  if (!d_weight.Empty()) {
+    CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
+  }
+  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+  predt.SetDevice(ctx->gpu_id);
+  auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
+
+  auto d_group_ptr = p_cache->DataGroupPtr(ctx);
+  auto n_groups = info.group_ptr_.size() - 1;  // NOTE(review): not used below
+
+  auto d_inv_idcg = p_cache->InvIDCG(ctx);
+  auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values());
+  auto d_out_dcg = p_cache->Dcg(ctx);
+
+  ltr::cuda_impl::CalcQueriesDCG(ctx, d_label, d_sorted_idx, p.ndcg_exp_gain, d_group_ptr, p.TopK(),
+                                 d_out_dcg);
+
+  auto it = dh::MakeTransformIterator<PackedReduceResult>(
+      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
+        if (d_inv_idcg(i) <= 0.0) {  // non-positive inverse IDCG: fixed score chosen by `minus`
+          return PackedReduceResult{minus ? 0.0 : 1.0, static_cast<double>(d_weight[i])};
+        }
+        return PackedReduceResult{d_out_dcg(i) * d_inv_idcg(i) * d_weight[i],
+                                  static_cast<double>(d_weight[i])};
+      });
+  auto pair = thrust::reduce(ctx->CUDACtx()->CTP(), it, it + d_out_dcg.Size(),
+                             PackedReduceResult{0.0, 0.0});
+  return pair;
+}
+}  // namespace cuda_impl
+}  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.h b/src/metric/rank_metric.h
new file mode 100644
index 000000000..0be0d4ee8
--- /dev/null
+++ b/src/metric/rank_metric.h
@@ -0,0 +1,33 @@
+#ifndef XGBOOST_METRIC_RANK_METRIC_H_
+#define XGBOOST_METRIC_RANK_METRIC_H_
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include <memory>                        // for shared_ptr
+
+#include "../common/common.h"            // for AssertGPUSupport
+#include "../common/ranking_utils.h"     // for NDCGCache
+#include "metric_common.h"               // for PackedReduceResult
+#include "xgboost/context.h"             // for Context
+#include "xgboost/data.h"                // for MetaInfo
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+
+namespace xgboost {
+namespace metric {
+namespace cuda_impl {
+PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
+                             HostDeviceVector<float> const &predt, bool minus,
+                             std::shared_ptr<ltr::NDCGCache> p_cache);
+
+#if !defined(XGBOOST_USE_CUDA)  // stub definition for builds without CUDA
+inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
+                                    HostDeviceVector<float> const &, bool,
+                                    std::shared_ptr<ltr::NDCGCache>) {
+  common::AssertGPUSupport();
+  return {};  // value-initialised result returned after the GPU-support assertion
+}
+#endif
+}  // namespace cuda_impl
+}  // namespace metric
+}  // namespace xgboost
+#endif  // XGBOOST_METRIC_RANK_METRIC_H_
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index 1edbd9fc8..337ddbc8a 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -1,7 +1,20 @@
-// Copyright by Contributors
-#include <xgboost/metric.h>
+/**
+ * Copyright 2016-2023 by XGBoost Contributors
+ */
+#include <gtest/gtest.h>                 // for Test, EXPECT_NEAR, ASSERT_STREQ
+#include <xgboost/context.h>             // for Context
+#include <xgboost/data.h>                // for MetaInfo, DMatrix
+#include <xgboost/linalg.h>              // for Matrix
+#include <xgboost/metric.h>              // for Metric
 
-#include "../helpers.h"
+#include <algorithm>                     // for max
+#include <memory>                        // for unique_ptr
+#include <vector>                        // for vector
+
+#include "../helpers.h"                  // for GetMetricEval, CreateEmptyGe...
+#include "xgboost/base.h"                // for bst_float, kRtEps
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/json.h"                // for Json, String, Object
 
 #if !defined(__CUDACC__)
 TEST(Metric, AMS) {
@@ -51,15 +64,17 @@ TEST(Metric, DeclareUnifiedTest(Precision)) {
   delete metric;
 }
 
+namespace xgboost {
+namespace metric {
 TEST(Metric, DeclareUnifiedTest(NDCG)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &ctx);
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  Metric * metric = xgboost::Metric::Create("ndcg", &ctx);
   ASSERT_STREQ(metric->Name(), "ndcg");
   EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
-  EXPECT_NEAR(GetMetricEval(metric,
+  ASSERT_NEAR(GetMetricEval(metric,
                             xgboost::HostDeviceVector<xgboost::bst_float>{},
                             {}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
+  ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
   EXPECT_NEAR(GetMetricEval(metric,
                             {0.1f, 0.9f, 0.1f, 0.9f},
                             {  0,   0,   1,   1}),
@@ -80,7 +95,7 @@ TEST(Metric, DeclareUnifiedTest(NDCG)) {
   EXPECT_NEAR(GetMetricEval(metric,
                             xgboost::HostDeviceVector<xgboost::bst_float>{},
                             {}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
+  ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
   EXPECT_NEAR(GetMetricEval(metric,
                             {0.1f, 0.9f, 0.1f, 0.9f},
                             {  0,   0,   1,   1}),
@@ -91,29 +106,30 @@ TEST(Metric, DeclareUnifiedTest(NDCG)) {
   EXPECT_NEAR(GetMetricEval(metric,
                             xgboost::HostDeviceVector<xgboost::bst_float>{},
                             {}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
   EXPECT_NEAR(GetMetricEval(metric,
                             {0.1f, 0.9f, 0.1f, 0.9f},
                             {  0,   0,   1,   1}),
-              0.6509f, 0.001f);
+               0.6509f, 0.001f);
 
   delete metric;
   metric = xgboost::Metric::Create("ndcg@2-", &ctx);
   ASSERT_STREQ(metric->Name(), "ndcg@2-");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
   EXPECT_NEAR(GetMetricEval(metric,
                             {0.1f, 0.9f, 0.1f, 0.9f},
                             {  0,   0,   1,   1}),
-              0.3868f, 0.001f);
+              0.3868f, 0.001f);
 
   delete metric;
 }
 
 TEST(Metric, DeclareUnifiedTest(MAP)) {
   auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("map", &ctx);
+  Metric * metric = xgboost::Metric::Create("map", &ctx);
   ASSERT_STREQ(metric->Name(), "map");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, kRtEps);
+
   EXPECT_NEAR(GetMetricEval(metric,
                             {0.1f, 0.9f, 0.1f, 0.9f},
                             {  0,   0,   1,   1}),
@@ -154,3 +170,39 @@ TEST(Metric, DeclareUnifiedTest(MAP)) {
               0.25f, 0.001f);
   delete metric;
 }
+
+TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) {
+  Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+
+  auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id};
+  info.num_row_ = info.labels.Shape(0);
+  info.group_ptr_.resize(2);
+  info.group_ptr_[0] = 0;
+  info.group_ptr_[1] = info.num_row_;
+  HostDeviceVector<float> predt{{0.1f, 0.2f, 0.3f, 4.0f, 70.0f}};
+
+  std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)};
+  Json config{Object{}};
+  config["name"] = String{"ndcg"};
+  config["lambdarank_param"] = Object{};
+  config["lambdarank_param"]["ndcg_exp_gain"] = String{"true"};
+  config["lambdarank_param"]["lambdarank_num_pair_per_sample"] = String{"32"};
+  metric->LoadConfig(config);
+
+  auto ndcg = metric->Evaluate(predt, p_fmat);
+  ASSERT_NEAR(ndcg, 0.409738f, kRtEps);
+
+  config["lambdarank_param"]["ndcg_exp_gain"] = String{"false"};
+  metric->LoadConfig(config);
+
+  ndcg = metric->Evaluate(predt, p_fmat);
+  ASSERT_NEAR(ndcg, 0.695694f, kRtEps);
+
+  predt.HostVector() = info.labels.Data()->HostVector();
+  ndcg = metric->Evaluate(predt, p_fmat);
+  ASSERT_NEAR(ndcg, 1.0, kRtEps);
+}
+}  // namespace metric
+}  // namespace xgboost

From f186c87cf9f753d20864acf2de6e5fd9b25e027a Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 15 Mar 2023 11:24:35 +0800
Subject: [PATCH 13/32] Check inf in data for all types of DMatrix. (#8911)

---
 .../dmlc/xgboost4j/gpu/java/BoosterTest.java  |  7 ++---
 .../spark/GpuXGBoostClassifierSuite.scala     | 23 ++++++++++------
 python-package/xgboost/testing/data.py        | 16 ++++++++++++
 src/common/error_msg.h                        |  4 +++
 src/data/data.cc                              | 11 +++++---
 src/data/device_adapter.cuh                   | 19 +++++++++++++-
 src/data/ellpack_page.cu                      | 26 ++++++++++---------
 src/data/gradient_index.h                     | 20 +++++++++-----
 src/data/simple_dmatrix.cuh                   | 24 ++++++++++-------
 .../test_device_quantile_dmatrix.py           |  7 +++++
 tests/python/test_quantile_dmatrix.py         |  6 ++++-
 11 files changed, 118 insertions(+), 45 deletions(-)

diff --git a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
index 49d17b6be..25705fd1b 100644
--- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
+++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
@@ -84,9 +84,10 @@ public class BoosterTest {
     };
 
     try (Table tmpTable = Table.readCSV(schema, opts, new File(trainingDataPath))) {
-      ColumnVector[] df = new ColumnVector[12];
-      for (int i = 0; i < 12; ++i) {
-        df[i] = tmpTable.getColumn(i);
+      ColumnVector[] df = new ColumnVector[10];
+      // exclude the first two columns, they are label bounds and contain inf.
+      for (int i = 2; i < 12; ++i) {
+        df[i - 2] = tmpTable.getColumn(i);
       }
       try (Table X = new Table(df);) {
         ColumnVector[] labels = new ColumnVector[1];
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala
index fc26b2985..7e24fe0dd 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostClassifierSuite.scala
@@ -21,7 +21,7 @@ import java.io.File
 import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
 
 import org.apache.spark.ml.feature.VectorAssembler
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.functions.{col, udf, when}
 import org.apache.spark.sql.types.{FloatType, StructField, StructType}
 
 class GpuXGBoostClassifierSuite extends GpuTestSuite {
@@ -47,7 +47,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
         "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
         "features_cols" -> featureNames, "label_col" -> labelName)
       val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
       // Get a model
       val model = new XGBoostClassifier(xgbParam)
         .fit(originalDf)
@@ -64,7 +65,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
         "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
         "features_cols" -> featureNames, "label_col" -> labelName)
       val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
       val getWeightFromF1 = udf({ f1: Float => if (f1.toInt % 2 == 0) 1.0f else 0.001f })
       val dfWithWeight = originalDf.withColumn("weight", getWeightFromF1(col("f1")))
 
@@ -87,7 +89,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
       val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
         "num_round" -> 10, "num_workers" -> 1)
       val Array(rawInput, testDf) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
 
       val classifier = new XGBoostClassifier(xgbParam)
         .setFeaturesCol(featureNames)
@@ -122,7 +125,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
       val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
         "num_round" -> 10, "num_workers" -> 1)
       val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
 
       val vectorAssembler = new VectorAssembler()
         .setHandleInvalid("keep")
@@ -144,7 +148,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
     // transform on GPU
     withGpuSparkSession() { spark =>
       val Array(_, testDf) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
 
       // Since CPU model does not know the information about the features cols that GPU transform
       // pipeline requires. End user needs to setFeaturesCol(features: Array[String]) in the model
@@ -174,7 +179,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
       val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "binary:logistic",
         "num_round" -> 10, "num_workers" -> 1)
       val Array(rawInput, _) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
 
       val classifier = new XGBoostClassifier(xgbParam)
         .setFeaturesCol(featureNames)
@@ -190,7 +196,8 @@ class GpuXGBoostClassifierSuite extends GpuTestSuite {
     // transform on CPU
     withCpuSparkSession() { spark =>
       val Array(_, rawInput) = spark.read.option("header", "true").schema(schema)
-        .csv(dataPath).randomSplit(Array(0.7, 0.3), seed = 1)
+        .csv(dataPath).withColumn("f2", when(col("f2").isin(Float.PositiveInfinity), 0))
+        .randomSplit(Array(0.7, 0.3), seed = 1)
 
       val featureColName = "feature_col"
       val vectorAssembler = new VectorAssembler()
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
index 4f79d7358..a9ea0019c 100644
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -2,7 +2,10 @@
 from typing import Any, Generator, Tuple, Union
 
 import numpy as np
+import pytest
+from numpy.random import Generator as RNG
 
+import xgboost
 from xgboost.data import pandas_pyarrow_mapper
 
 
@@ -179,3 +182,16 @@ def pd_arrow_dtypes() -> Generator:
         dtype=pd.ArrowDtype(pa.bool_()),
     )
     yield orig, df
+
+
+def check_inf(rng: RNG) -> None:
+    """Check that DMatrix construction rejects input containing inf."""
+    X = rng.random(size=32).reshape(8, 4)
+    y = rng.random(size=8)
+    X[5, 2] = np.inf  # inject a single inf entry into the features
+
+    with pytest.raises(ValueError, match="Input data contains `inf`"):
+        xgboost.QuantileDMatrix(X, y)
+
+    with pytest.raises(ValueError, match="Input data contains `inf`"):
+        xgboost.DMatrix(X, y)
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 48a2c92a4..484595316 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -20,5 +20,9 @@ constexpr StringView GroupSize() {
 constexpr StringView LabelScoreSize() {
   return "The size of label doesn't match the size of prediction.";
 }
+
+constexpr StringView InfInData() {  // message reported when inf is found in the input data
+  return "Input data contains `inf` while `missing` is not set to `inf`";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/data/data.cc b/src/data/data.cc
index d24048a2a..aa96a1bc8 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -10,13 +10,16 @@
 #include <cstring>
 
 #include "../collective/communicator-inl.h"
-#include "../common/algorithm.h"  // StableSort
-#include "../common/api_entry.h"  // XGBAPIThreadLocalEntry
+#include "../collective/communicator.h"
+#include "../common/common.h"
+#include "../common/algorithm.h"  // for StableSort
+#include "../common/api_entry.h"  // for XGBAPIThreadLocalEntry
+#include "../common/error_msg.h"  // for InfInData
 #include "../common/group_data.h"
 #include "../common/io.h"
 #include "../common/linalg_op.h"
 #include "../common/math.h"
-#include "../common/numeric.h"  // Iota
+#include "../common/numeric.h"  // for Iota
 #include "../common/threading_utils.h"
 #include "../common/version.h"
 #include "../data/adapter.h"
@@ -1144,7 +1147,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
     });
   }
   exec.Rethrow();
-  CHECK(valid) << "Input data contains `inf` or `nan`";
+  CHECK(valid) << error::InfInData();
   for (const auto & max : max_columns_vector) {
     max_columns = std::max(max_columns, max[0]);
   }
diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh
index 56c494dd1..494fb7d1c 100644
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -4,7 +4,10 @@
  */
 #ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
 #define XGBOOST_DATA_DEVICE_ADAPTER_H_
-#include <cstddef>  // for size_t
+#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
+#include <thrust/logical.h>                     // for none_of
+
+#include <cstddef>                              // for size_t
 #include <limits>
 #include <memory>
 #include <string>
@@ -213,6 +216,20 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
                  static_cast<std::size_t>(0), thrust::maximum<size_t>());
   return row_stride;
 }
+
+/**
+ * \brief Check there's no inf in data. NOTE: despite the name, returns true when no inf is found.
+ */
+template <typename AdapterBatchT>
+bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
+  auto counting = thrust::make_counting_iterator(0llu);
+  auto value_iter = dh::MakeTransformIterator<float>(
+      counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
+  auto valid =
+      thrust::none_of(value_iter, value_iter + batch.Size(),
+                      [is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
+  return valid;  // true: no element both passes is_valid and is inf; callers CHECK() this value
+}
 };  // namespace data
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_DEVICE_ADAPTER_H_
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 99e17d886..d631407a1 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 XGBoost contributors
+/**
+ * Copyright 2019-2023 by XGBoost contributors
  */
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
@@ -9,7 +9,7 @@
 #include "../common/random.h"
 #include "../common/transform_iterator.h"  // MakeIndexTransformIter
 #include "./ellpack_page.cuh"
-#include "device_adapter.cuh"
+#include "device_adapter.cuh"  // for HasInfInData
 #include "gradient_index.h"
 #include "xgboost/data.h"
 
@@ -189,9 +189,8 @@ struct TupleScanOp {
 // Here the data is already correctly ordered and simply needs to be compacted
 // to remove missing data
 template <typename AdapterBatchT>
-void CopyDataToEllpack(const AdapterBatchT &batch,
-                       common::Span<FeatureType const> feature_types,
-                       EllpackPageImpl *dst, int device_idx, float missing) {
+void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
+                       EllpackPageImpl* dst, int device_idx, float missing) {
   // Some witchcraft happens here
   // The goal is to copy valid elements out of the input to an ELLPACK matrix
   // with a given row stride, using no extra working memory Standard stream
@@ -201,6 +200,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
   // correct output position
   auto counting = thrust::make_counting_iterator(0llu);
   data::IsValidFunctor is_valid(missing);
+  bool valid = data::HasInfInData(batch, is_valid);
+  CHECK(valid) << error::InfInData();
+
   auto key_iter = dh::MakeTransformIterator<size_t>(
       counting,
       [=] __device__(size_t idx) {
@@ -239,9 +241,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
       cub::DispatchScan<decltype(key_value_index_iter), decltype(out),
                         TupleScanOp<Tuple>, cub::NullType, int64_t>;
 #if THRUST_MAJOR_VERSION >= 2
-  DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
-                         TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
-                         nullptr);
+  dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
+                                       TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
+                                       nullptr));
 #else
   DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
                          TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
@@ -249,9 +251,9 @@ void CopyDataToEllpack(const AdapterBatchT &batch,
 #endif
   dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
 #if THRUST_MAJOR_VERSION >= 2
-  DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
-                         key_value_index_iter, out, TupleScanOp<Tuple>(),
-                         cub::NullType(), batch.Size(), nullptr);
+  dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
+                                       key_value_index_iter, out, TupleScanOp<Tuple>(),
+                                       cub::NullType(), batch.Size(), nullptr));
 #else
   DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
                          key_value_index_iter, out, TupleScanOp<Tuple>(),
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 9eba9637f..3cb0709bd 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -1,21 +1,23 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors
+/**
+ * Copyright 2017-2023 by XGBoost Contributors
  * \brief Data type for fast histogram aggregation.
  */
 #ifndef XGBOOST_DATA_GRADIENT_INDEX_H_
 #define XGBOOST_DATA_GRADIENT_INDEX_H_
 
-#include <algorithm>  // std::min
-#include <cinttypes>  // std::uint32_t
-#include <cstddef>    // std::size_t
+#include <algorithm>  // for min
+#include <atomic>     // for atomic
+#include <cinttypes>  // for uint32_t
+#include <cstddef>    // for size_t
 #include <memory>
 #include <vector>
 
 #include "../common/categorical.h"
+#include "../common/error_msg.h"  // for InfInData
 #include "../common/hist_util.h"
 #include "../common/numeric.h"
 #include "../common/threading_utils.h"
-#include "../common/transform_iterator.h"  // common::MakeIndexTransformIter
+#include "../common/transform_iterator.h"  // for MakeIndexTransformIter
 #include "adapter.h"
 #include "proxy_dmatrix.h"
 #include "xgboost/base.h"
@@ -62,6 +64,7 @@ class GHistIndexMatrix {
     BinIdxType* index_data = index_data_span.data();
     auto const& ptrs = cut.Ptrs();
     auto const& values = cut.Values();
+    std::atomic<bool> valid{true};
     common::ParallelFor(batch_size, batch_threads, [&](size_t i) {
       auto line = batch.GetLine(i);
       size_t ibegin = row_ptr[rbegin + i];  // index of first entry for current block
@@ -70,6 +73,9 @@ class GHistIndexMatrix {
       for (size_t j = 0; j < line.Size(); ++j) {
         data::COOTuple elem = line.GetElement(j);
         if (is_valid(elem)) {
+          if (XGBOOST_EXPECT((std::isinf(elem.value)), false)) {
+            valid = false;
+          }
           bst_bin_t bin_idx{-1};
           if (common::IsCat(ft, elem.column_idx)) {
             bin_idx = cut.SearchCatBin(elem.value, elem.column_idx, ptrs, values);
@@ -82,6 +88,8 @@ class GHistIndexMatrix {
         }
       }
     });
+
+    CHECK(valid) << error::InfInData();
   }
 
   // Gather hit_count from all threads
diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh
index c71a52b67..63310a929 100644
--- a/src/data/simple_dmatrix.cuh
+++ b/src/data/simple_dmatrix.cuh
@@ -1,18 +1,19 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
  * \file simple_dmatrix.cuh
  */
 #ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
 #define XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
 
 #include <thrust/copy.h>
-#include <thrust/scan.h>
 #include <thrust/execution_policy.h>
-#include "device_adapter.cuh"
-#include "../common/device_helpers.cuh"
+#include <thrust/scan.h>
 
-namespace xgboost {
-namespace data {
+#include "../common/device_helpers.cuh"
+#include "../common/error_msg.h"  // for InfInData
+#include "device_adapter.cuh"     // for HasInfInData
+
+namespace xgboost::data {
 
 template <typename AdapterBatchT>
 struct COOToEntryOp {
@@ -61,7 +62,11 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
 }
 
 template <typename AdapterBatchT>
-size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, SparsePage* page) {
+size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
+                        SparsePage* page) {
+  bool valid = HasInfInData(batch, IsValidFunctor{missing});
+  CHECK(valid) << error::InfInData();
+
   page->offset.SetDevice(device);
   page->data.SetDevice(device);
   page->offset.Resize(batch.NumRows() + 1);
@@ -73,6 +78,5 @@ size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missin
 
   return num_nonzero_;
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index 0250cea3f..3cd65e30f 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -6,6 +6,7 @@ from hypothesis import given, settings, strategies
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.data import check_inf
 
 sys.path.append("tests/python")
 import test_quantile_dmatrix as tqd
@@ -153,3 +154,9 @@ class TestQuantileDMatrix:
         from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
 
         assert tm.predictor_equal(from_qdm, from_dm)
+
+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_check_inf(self) -> None:
+        import cupy as cp
+        rng = cp.random.default_rng(1994)
+        check_inf(rng)
diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py
index 316d0e5f6..537910725 100644
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -15,7 +15,7 @@ from xgboost.testing import (
     make_sparse_regression,
     predictor_equal,
 )
-from xgboost.testing.data import np_dtypes
+from xgboost.testing.data import check_inf, np_dtypes
 
 
 class TestQuantileDMatrix:
@@ -244,6 +244,10 @@ class TestQuantileDMatrix:
         from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
         assert predictor_equal(from_qdm, from_dm)
 
+    def test_check_inf(self) -> None:
+        rng = np.random.default_rng(1994)
+        check_inf(rng)
+
     # we don't test empty Quantile DMatrix in single node construction.
     @given(
         strategies.integers(1, 1000),

From fd016e43c695f7d38f0b3edcf8b1d235081a133e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 15 Mar 2023 18:51:46 +0800
Subject: [PATCH 14/32] Bump maven-surefire-plugin from 2.22.2 to 3.0.0 in
 /jvm-packages (#8917)

Bumps [maven-surefire-plugin](https://github.com/apache/maven-surefire) from 2.22.2 to 3.0.0.
- [Release notes](https://github.com/apache/maven-surefire/releases)
- [Commits](https://github.com/apache/maven-surefire/compare/surefire-2.22.2...surefire-3.0.0)

---
updated-dependencies:
- dependency-name: org.apache.maven.plugins:maven-surefire-plugin
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 852cf7f69..b97eccc01 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -427,7 +427,7 @@
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-surefire-plugin</artifactId>
-                <version>2.22.2</version>
+                <version>3.0.0</version>
                 <configuration>
                     <skipTests>false</skipTests>
                     <useSystemClassLoader>false</useSystemClassLoader>

From a2cdba51ce0ce647a81f1850e002b0dbbca2a2ea Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Wed, 15 Mar 2023 10:02:38 -0700
Subject: [PATCH 15/32] Use hi-res SVG logo (#8923)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 219831114..2fae68ac5 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/xgboost.png width=135/>  eXtreme Gradient Boosting
+<img src="https://xgboost.ai/images/logo/xgboost-logo.svg" width=135/>  eXtreme Gradient Boosting
 ===========
 [![Build Status](https://xgboost-ci.net/job/xgboost/job/master/badge/icon)](https://xgboost-ci.net/blue/organizations/jenkins/xgboost/activity)
 [![XGBoost-CI](https://github.com/dmlc/xgboost/workflows/XGBoost-CI/badge.svg?branch=master)](https://github.com/dmlc/xgboost/actions)

From 26209a42a506ebe1193b27a53690791aa07a7f7b Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 16 Mar 2023 02:43:11 +0800
Subject: [PATCH 16/32] Define git attributes for renormalization. (#8921)

---
 .gitattributes                                |  18 +
 jvm-packages/xgboost4j-example/README.md      |  60 +-
 .../src/test/resources/rank.test.csv          | 132 +--
 .../src/test/resources/rank.train.csv         | 298 +++---
 plugin/updater_oneapi/predictor_oneapi.cc     | 894 +++++++++---------
 .../updater_oneapi/regression_loss_oneapi.h   | 290 +++---
 .../updater_oneapi/regression_obj_oneapi.cc   | 364 +++----
 src/common/partition_builder.h                | 782 +++++++--------
 src/tree/driver.h                             | 222 ++---
 tests/cpp/common/test_partition_builder.cc    | 158 ++--
 10 files changed, 1618 insertions(+), 1600 deletions(-)
 create mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..5c71e130e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,18 @@
+* text=auto
+
+*.c   text eol=lf
+*.h   text eol=lf
+*.cc  text eol=lf
+*.cuh text eol=lf
+*.cu  text eol=lf
+*.py  text eol=lf
+*.txt text eol=lf
+*.R   text eol=lf
+*.scala text eol=lf
+*.java  text eol=lf
+
+*.sh text eol=lf
+
+*.rst text eol=lf
+*.md  text eol=lf
+*.csv text eol=lf
\ No newline at end of file
diff --git a/jvm-packages/xgboost4j-example/README.md b/jvm-packages/xgboost4j-example/README.md
index 4718f212f..50f268e83 100644
--- a/jvm-packages/xgboost4j-example/README.md
+++ b/jvm-packages/xgboost4j-example/README.md
@@ -1,30 +1,30 @@
-XGBoost4J Code Examples
-=======================
-
-## Java API
-* [Basic walkthrough of wrappers](src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java)
-* [Customize loss function, and evaluation metric](src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java)
-* [Boosting from existing prediction](src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java)
-* [Predicting using first n trees](src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java)
-* [Generalized Linear Model](src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java)
-* [Cross validation](src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java)
-* [Predicting leaf indices](src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java)
-* [External Memory](src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java)
-* [Early Stopping](src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java)
-
-## Scala API
-
-* [Basic walkthrough of wrappers](src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala)
-* [Customize loss function, and evaluation metric](src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala)
-* [Boosting from existing prediction](src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala)
-* [Predicting using first n trees](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala)
-* [Generalized Linear Model](src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala)
-* [Cross validation](src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala)
-* [Predicting leaf indices](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala)
-* [External Memory](src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala)
-
-## Spark API
-* [Distributed Training with Spark](src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala)
-
-## Flink API
-* [Distributed Training with Flink](src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala)
+XGBoost4J Code Examples
+=======================
+
+## Java API
+* [Basic walkthrough of wrappers](src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java)
+* [Customize loss function, and evaluation metric](src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java)
+* [Boosting from existing prediction](src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java)
+* [Predicting using first n trees](src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java)
+* [Generalized Linear Model](src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java)
+* [Cross validation](src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java)
+* [Predicting leaf indices](src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java)
+* [External Memory](src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java)
+* [Early Stopping](src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java)
+
+## Scala API
+
+* [Basic walkthrough of wrappers](src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala)
+* [Customize loss function, and evaluation metric](src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala)
+* [Boosting from existing prediction](src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala)
+* [Predicting using first n trees](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala)
+* [Generalized Linear Model](src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala)
+* [Cross validation](src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala)
+* [Predicting leaf indices](src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala)
+* [External Memory](src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala)
+
+## Spark API
+* [Distributed Training with Spark](src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala)
+
+## Flink API
+* [Distributed Training with Flink](src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala)
diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv b/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv
index 83bf8b080..729732e5b 100644
--- a/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv
+++ b/jvm-packages/xgboost4j-spark/src/test/resources/rank.test.csv
@@ -1,66 +1,66 @@
-0,10.0229017899,7.30178495562,0.118115020017,1
-0,9.93639621859,9.93102159291,0.0435030004396,1
-0,10.1301737265,0.00411765220572,2.4165878053,1
-1,9.87828587087,0.608588414992,0.111262590883,1
-0,10.1373430048,0.47764012225,0.991553052194,1
-0,10.0523814718,4.72152505167,0.672978832666,1
-0,10.0449715742,8.40373928536,0.384457573667,1
-1,996.398498791,941.976309154,0.230269231292,2
-0,1005.11269468,900.093680877,0.265031528873,2
-0,997.160349441,891.331101688,2.19362017313,2
-0,993.754139031,44.8000165317,1.03868009875,2
-1,994.831299184,241.959208453,0.667631827024,2
-0,995.948333283,7.94326917112,0.750490877118,3
-0,989.733981273,7.52077625436,0.0126335967282,3
-0,1003.54086516,6.48177510564,1.19441696788,3
-0,996.56177804,9.71959812613,1.33082465111,3
-0,1005.61382467,0.234339369309,1.17987797356,3
-1,980.215758708,6.85554542926,2.63965085259,3
-1,987.776408872,2.23354609991,0.841885278028,3
-0,1006.54260396,8.12142049834,2.26639471174,3
-0,1009.87927639,6.40028519044,0.775155669615,3
-0,9.95006244393,928.76896718,234.948458244,4
-1,10.0749152258,255.294574476,62.9728604166,4
-1,10.1916541988,312.682867085,92.299413677,4
-0,9.95646724484,742.263188416,53.3310473654,4
-0,9.86211293222,996.237023866,2.00760301168,4
-1,9.91801019468,303.971783709,50.3147230679,4
-0,996.983996934,9.52188222766,1.33588120981,5
-0,995.704388126,9.49260524915,0.908498516541,5
-0,987.86480767,0.0870786716821,0.108859297837,5
-0,1000.99561307,2.85272694575,0.171134518956,5
-0,1011.05508066,7.55336771768,1.04950084825,5
-1,985.52199365,0.763305780608,1.7402424375,5
-0,10.0430321467,813.185427181,4.97728254185,6
-0,10.0812334228,258.297288417,0.127477670549,6
-0,9.84210504292,887.205815261,0.991689193955,6
-1,9.94625332613,0.298622762132,0.147881353231,6
-0,9.97800659954,727.619819757,0.0718361141866,6
-1,9.8037938472,957.385549617,0.0618862028941,6
-0,10.0880634741,185.024638577,1.7028095095,6
-0,9.98630799154,109.10631473,0.681117359751,6
-0,9.91671416638,166.248076588,122.538291094,7
-0,10.1206910464,88.1539468531,141.189859069,7
-1,10.1767160518,1.02960996847,172.02256237,7
-0,9.93025147233,391.196641942,58.040338247,7
-0,9.84850936037,474.63346537,17.5627875397,7
-1,9.8162731343,61.9199554213,30.6740972851,7
-0,10.0403482984,987.50416929,73.0472906209,7
-1,997.019228359,133.294717663,0.0572254083186,8
-0,973.303999107,1.79080888849,0.100478717048,8
-0,1008.28808825,342.282350685,0.409806485495,8
-0,1014.55621524,0.680510407082,0.929530602495,8
-1,1012.74370325,823.105266455,0.0894693730585,8
-0,1003.63554038,727.334432075,0.58206275756,8
-0,10.1560432436,740.35938307,11.6823378533,9
-0,9.83949099701,512.828227154,138.206666681,9
-1,10.1837395682,179.287126088,185.479062365,9
-1,9.9761881495,12.1093388336,9.1264604171,9
-1,9.77402180766,318.561317743,80.6005221355,9
-0,1011.15705381,0.215825852155,1.34429667906,10
-0,1005.60353229,727.202346126,1.47146041005,10
-1,1013.93702961,58.7312725205,0.421041560754,10
-0,1004.86813074,757.693204258,0.566055205344,10
-0,999.996324692,813.12386828,0.864428279513,10
-0,996.55255931,918.760056995,0.43365051974,10
-1,1004.1394132,464.371823646,0.312492288321,10
+0,10.0229017899,7.30178495562,0.118115020017,1
+0,9.93639621859,9.93102159291,0.0435030004396,1
+0,10.1301737265,0.00411765220572,2.4165878053,1
+1,9.87828587087,0.608588414992,0.111262590883,1
+0,10.1373430048,0.47764012225,0.991553052194,1
+0,10.0523814718,4.72152505167,0.672978832666,1
+0,10.0449715742,8.40373928536,0.384457573667,1
+1,996.398498791,941.976309154,0.230269231292,2
+0,1005.11269468,900.093680877,0.265031528873,2
+0,997.160349441,891.331101688,2.19362017313,2
+0,993.754139031,44.8000165317,1.03868009875,2
+1,994.831299184,241.959208453,0.667631827024,2
+0,995.948333283,7.94326917112,0.750490877118,3
+0,989.733981273,7.52077625436,0.0126335967282,3
+0,1003.54086516,6.48177510564,1.19441696788,3
+0,996.56177804,9.71959812613,1.33082465111,3
+0,1005.61382467,0.234339369309,1.17987797356,3
+1,980.215758708,6.85554542926,2.63965085259,3
+1,987.776408872,2.23354609991,0.841885278028,3
+0,1006.54260396,8.12142049834,2.26639471174,3
+0,1009.87927639,6.40028519044,0.775155669615,3
+0,9.95006244393,928.76896718,234.948458244,4
+1,10.0749152258,255.294574476,62.9728604166,4
+1,10.1916541988,312.682867085,92.299413677,4
+0,9.95646724484,742.263188416,53.3310473654,4
+0,9.86211293222,996.237023866,2.00760301168,4
+1,9.91801019468,303.971783709,50.3147230679,4
+0,996.983996934,9.52188222766,1.33588120981,5
+0,995.704388126,9.49260524915,0.908498516541,5
+0,987.86480767,0.0870786716821,0.108859297837,5
+0,1000.99561307,2.85272694575,0.171134518956,5
+0,1011.05508066,7.55336771768,1.04950084825,5
+1,985.52199365,0.763305780608,1.7402424375,5
+0,10.0430321467,813.185427181,4.97728254185,6
+0,10.0812334228,258.297288417,0.127477670549,6
+0,9.84210504292,887.205815261,0.991689193955,6
+1,9.94625332613,0.298622762132,0.147881353231,6
+0,9.97800659954,727.619819757,0.0718361141866,6
+1,9.8037938472,957.385549617,0.0618862028941,6
+0,10.0880634741,185.024638577,1.7028095095,6
+0,9.98630799154,109.10631473,0.681117359751,6
+0,9.91671416638,166.248076588,122.538291094,7
+0,10.1206910464,88.1539468531,141.189859069,7
+1,10.1767160518,1.02960996847,172.02256237,7
+0,9.93025147233,391.196641942,58.040338247,7
+0,9.84850936037,474.63346537,17.5627875397,7
+1,9.8162731343,61.9199554213,30.6740972851,7
+0,10.0403482984,987.50416929,73.0472906209,7
+1,997.019228359,133.294717663,0.0572254083186,8
+0,973.303999107,1.79080888849,0.100478717048,8
+0,1008.28808825,342.282350685,0.409806485495,8
+0,1014.55621524,0.680510407082,0.929530602495,8
+1,1012.74370325,823.105266455,0.0894693730585,8
+0,1003.63554038,727.334432075,0.58206275756,8
+0,10.1560432436,740.35938307,11.6823378533,9
+0,9.83949099701,512.828227154,138.206666681,9
+1,10.1837395682,179.287126088,185.479062365,9
+1,9.9761881495,12.1093388336,9.1264604171,9
+1,9.77402180766,318.561317743,80.6005221355,9
+0,1011.15705381,0.215825852155,1.34429667906,10
+0,1005.60353229,727.202346126,1.47146041005,10
+1,1013.93702961,58.7312725205,0.421041560754,10
+0,1004.86813074,757.693204258,0.566055205344,10
+0,999.996324692,813.12386828,0.864428279513,10
+0,996.55255931,918.760056995,0.43365051974,10
+1,1004.1394132,464.371823646,0.312492288321,10
diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv b/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv
index ebe232b51..bec3b034c 100644
--- a/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv
+++ b/jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv
@@ -1,149 +1,149 @@
-0,985.574005058,320.223538037,0.621236086198,1
-0,1010.52917943,635.535543082,2.14984030531,1
-0,1012.91900422,132.387300057,0.488761066665,1
-0,990.829194034,135.102081162,0.747701610673,1
-0,1007.05103629,154.289183562,0.464118249201,1
-0,994.9573036,317.483732878,0.0313685555674,1
-0,987.8071541,731.349178363,0.244616944245,1
-1,10.0349544469,2.29750906143,36.4949974282,2
-0,9.92953881383,5.39134047297,120.041297548,2
-0,10.0909866713,9.06191026312,138.807825798,2
-1,10.2090970614,0.0784495944448,58.207703565,2
-0,9.85695905893,9.99500727713,56.8610243778,2
-1,10.0805758547,0.0410805760559,222.102302076,2
-0,10.1209914486,9.9729127088,171.888238763,2
-0,10.0331939798,0.853339303793,311.181328375,3
-0,9.93901762951,2.72757449146,78.4859514413,3
-0,10.0752365346,9.18695328235,49.8520256553,3
-1,10.0456548902,0.270936043122,123.462958597,3
-0,10.0568923673,0.82997113263,44.9391426001,3
-0,9.8214143472,0.277538931578,15.4217659578,3
-0,9.95258604431,8.69564346094,255.513470671,3
-0,9.91934976357,7.72809741413,82.171591817,3
-0,10.043239582,8.64168255553,38.9657919329,3
-1,10.0236147929,0.0496662263659,4.40889812286,3
-1,1001.85585324,3.75646886071,0.0179224994842,4
-0,1014.25578571,0.285765311201,0.510329864983,4
-1,1002.81422786,9.77676280375,0.433705951912,4
-1,998.072711553,2.82100686538,0.889829076909,4
-0,1003.77395036,2.55916592114,0.0359402151496,4
-1,10.0807877782,4.98513959013,47.5266363559,5
-0,10.0015013081,9.94302478763,78.3697486277,5
-1,10.0441936789,0.305091816635,56.8213984987,5
-0,9.94257106618,7.23909568913,442.463339039,5
-1,9.86479307916,6.41701315844,55.1365304834,5
-0,10.0428628516,9.98466447697,0.391632812588,5
-0,9.94445884566,9.99970945878,260.438436534,5
-1,9.84641392823,225.78051312,1.00525978847,6
-1,9.86907690608,26.8971083147,0.577959255991,6
-0,10.0177314626,0.110585342313,2.30545043031,6
-0,10.0688190907,412.023866234,1.22421542264,6
-0,10.1251769646,13.8212202925,0.129171734504,6
-0,10.0840758802,407.359097187,0.477000870705,6
-0,10.1007458705,987.183625145,0.149385677415,6
-0,9.86472656059,169.559640615,0.147221652519,6
-0,9.94207419238,507.290053755,0.41996207214,6
-0,9.9671005502,1.62610457716,0.408173666788,6
-0,1010.57126596,9.06673707562,0.672092284372,7
-0,1001.6718262,9.53203990055,4.7364050044,7
-0,995.777341384,4.43847316256,2.07229073634,7
-0,1002.95701386,5.51711016665,1.24294450546,7
-0,1016.0988238,0.626468941906,0.105627919134,7
-0,1013.67571419,0.042315529666,0.717619310322,7
-1,994.747747892,6.01989364024,0.772910130015,7
-1,991.654593872,7.35575736952,1.19822091548,7
-0,1008.47101732,8.28240754909,0.229582481359,7
-0,1000.81975227,1.52448354056,0.096441660362,7
-0,10.0900922344,322.656649307,57.8149073088,8
-1,10.0868337371,2.88652339174,54.8865514572,8
-0,10.0988984137,979.483832657,52.6809830901,8
-0,9.97678959238,665.770979738,481.069628909,8
-0,9.78554312773,257.309358658,47.7324475232,8
-0,10.0985967566,935.896512941,138.937052808,8
-0,10.0522252319,876.376299607,6.00373510669,8
-1,9.88065229501,9.99979825653,0.0674603696149,9
-0,10.0483244098,0.0653852316381,0.130679349938,9
-1,9.99685215607,1.76602542774,0.2551321159,9
-0,9.99750159428,1.01591534436,0.145445506504,9
-1,9.97380908941,0.940048645571,0.411805696316,9
-0,9.99977678382,6.91329929641,5.57858201258,9
-0,978.876096381,933.775364741,0.579170824236,10
-0,998.381016406,220.940470582,2.01491778565,10
-0,987.917644594,8.74667873567,0.364006099758,10
-0,1000.20994892,25.2945450565,3.5684398964,10
-0,1014.57141264,675.593540733,0.164174055535,10
-0,998.867283535,765.452750642,0.818425293238,10
-0,10.2143092481,273.576539531,137.111774354,11
-0,10.0366658918,842.469052609,2.32134375927,11
-0,10.1281202091,395.654057342,35.4184893063,11
-0,10.1443721289,960.058461049,272.887070637,11
-0,10.1353234784,535.51304462,2.15393842032,11
-1,10.0451640374,216.733858424,55.6533298016,11
-1,9.94254592171,44.5985537358,304.614176871,11
-0,10.1319257181,613.545504487,5.42391587912,11
-0,1020.63622468,997.476744201,0.509425590461,12
-0,986.304585519,822.669937965,0.605133561808,12
-1,1012.66863221,26.7185759069,0.0875458784828,12
-0,995.387656321,81.8540176995,0.691999430068,12
-0,1020.6587198,848.826964547,0.540159430526,12
-1,1003.81573853,379.84350931,0.0083682925194,12
-0,1021.60921516,641.376951467,1.12339054807,12
-0,1000.17585041,122.107138713,1.09906375372,12
-1,987.64802348,5.98448541152,0.124241987204,12
-1,9.94610136583,346.114985897,0.387708236565,13
-0,9.96812192337,313.278109696,0.00863026595671,13
-0,10.0181739194,36.7378924562,2.92179879835,13
-0,9.89000102695,164.273723971,0.685222591968,13
-0,10.1555212436,320.451459462,2.01341536261,13
-0,10.0085727613,999.767117646,0.462294934168,13
-1,9.93099658724,5.17478203909,0.213855205032,13
-0,10.0629454957,663.088181857,0.049022351462,13
-0,10.1109732417,734.904569784,1.6998450094,13
-0,1006.6015266,505.023453703,1.90870566777,14
-0,991.865769489,245.437343115,0.475109744256,14
-0,998.682734072,950.041057232,1.9256314201,14
-0,1005.02207209,2.9619314197,0.0517146822357,14
-0,1002.54526214,860.562681899,0.915687092848,14
-0,1000.38847359,808.416525088,0.209690673808,14
-1,992.557818382,373.889409453,0.107571728577,14
-0,1002.07722137,997.329626371,1.06504260496,14
-0,1000.40504333,949.832139189,0.539159980327,14
-0,10.1460179902,8.86082969819,135.953842715,15
-1,9.98529296553,2.87366448495,1.74249892194,15
-0,9.88942676744,9.4031821056,149.473066381,15
-1,10.0192953341,1.99685737576,1.79502473397,15
-0,10.0110654379,8.13112593726,87.7765628103,15
-0,997.148677047,733.936190093,1.49298494242,16
-0,1008.70465919,957.121652078,0.217414013634,16
-1,997.356154278,541.599587807,0.100855972216,16
-0,999.615897283,943.700501824,0.862874175879,16
-1,997.36859077,0.200859940848,0.13601892182,16
-0,10.0423255624,1.73855202168,0.956695338485,17
-1,9.88440755486,9.9994600678,0.305080529665,17
-0,10.0891026412,3.28031719474,0.364450973697,17
-0,9.90078644258,8.77839663617,0.456660574479,17
-1,9.79380029711,8.77220326156,0.527292005175,17
-0,9.93613887011,9.76270841268,1.40865693823,17
-0,10.0009239007,7.29056178263,0.498015866607,17
-0,9.96603319905,5.12498000925,0.517492532783,17
-0,10.0923827222,2.76652583955,1.56571226159,17
-1,10.0983782035,587.788120694,0.031756483687,18
-1,9.91397225464,994.527496819,3.72092164978,18
-0,10.1057472738,2.92894440088,0.683506438532,18
-0,10.1014053354,959.082038017,1.07039624129,18
-0,10.1433253044,322.515119317,0.51408278993,18
-1,9.82832510699,637.104433908,0.250272776427,18
-0,1000.49729075,2.75336888111,0.576634423274,19
-1,984.90338088,0.0295435794035,1.26273339929,19
-0,1001.53811442,4.64164410861,0.0293389959504,19
-1,995.875898395,5.08223403205,0.382330566779,19
-0,996.405937252,6.26395190757,0.453645816611,19
-0,10.0165140779,340.126072514,0.220794603312,20
-0,9.93482824816,951.672000448,0.124406293612,20
-0,10.1700278554,0.0140985961008,0.252452256311,20
-0,9.99825079542,950.382643896,0.875382402062,20
-0,9.87316410028,686.788257829,0.215886999825,20
-0,10.2893240654,89.3947931451,0.569578232133,20
-0,9.98689192703,0.430107535413,2.99869831728,20
-0,10.1365175107,972.279245093,0.0865099386744,20
-0,9.90744703306,50.810461183,3.00863325197,20
+0,985.574005058,320.223538037,0.621236086198,1
+0,1010.52917943,635.535543082,2.14984030531,1
+0,1012.91900422,132.387300057,0.488761066665,1
+0,990.829194034,135.102081162,0.747701610673,1
+0,1007.05103629,154.289183562,0.464118249201,1
+0,994.9573036,317.483732878,0.0313685555674,1
+0,987.8071541,731.349178363,0.244616944245,1
+1,10.0349544469,2.29750906143,36.4949974282,2
+0,9.92953881383,5.39134047297,120.041297548,2
+0,10.0909866713,9.06191026312,138.807825798,2
+1,10.2090970614,0.0784495944448,58.207703565,2
+0,9.85695905893,9.99500727713,56.8610243778,2
+1,10.0805758547,0.0410805760559,222.102302076,2
+0,10.1209914486,9.9729127088,171.888238763,2
+0,10.0331939798,0.853339303793,311.181328375,3
+0,9.93901762951,2.72757449146,78.4859514413,3
+0,10.0752365346,9.18695328235,49.8520256553,3
+1,10.0456548902,0.270936043122,123.462958597,3
+0,10.0568923673,0.82997113263,44.9391426001,3
+0,9.8214143472,0.277538931578,15.4217659578,3
+0,9.95258604431,8.69564346094,255.513470671,3
+0,9.91934976357,7.72809741413,82.171591817,3
+0,10.043239582,8.64168255553,38.9657919329,3
+1,10.0236147929,0.0496662263659,4.40889812286,3
+1,1001.85585324,3.75646886071,0.0179224994842,4
+0,1014.25578571,0.285765311201,0.510329864983,4
+1,1002.81422786,9.77676280375,0.433705951912,4
+1,998.072711553,2.82100686538,0.889829076909,4
+0,1003.77395036,2.55916592114,0.0359402151496,4
+1,10.0807877782,4.98513959013,47.5266363559,5
+0,10.0015013081,9.94302478763,78.3697486277,5
+1,10.0441936789,0.305091816635,56.8213984987,5
+0,9.94257106618,7.23909568913,442.463339039,5
+1,9.86479307916,6.41701315844,55.1365304834,5
+0,10.0428628516,9.98466447697,0.391632812588,5
+0,9.94445884566,9.99970945878,260.438436534,5
+1,9.84641392823,225.78051312,1.00525978847,6
+1,9.86907690608,26.8971083147,0.577959255991,6
+0,10.0177314626,0.110585342313,2.30545043031,6
+0,10.0688190907,412.023866234,1.22421542264,6
+0,10.1251769646,13.8212202925,0.129171734504,6
+0,10.0840758802,407.359097187,0.477000870705,6
+0,10.1007458705,987.183625145,0.149385677415,6
+0,9.86472656059,169.559640615,0.147221652519,6
+0,9.94207419238,507.290053755,0.41996207214,6
+0,9.9671005502,1.62610457716,0.408173666788,6
+0,1010.57126596,9.06673707562,0.672092284372,7
+0,1001.6718262,9.53203990055,4.7364050044,7
+0,995.777341384,4.43847316256,2.07229073634,7
+0,1002.95701386,5.51711016665,1.24294450546,7
+0,1016.0988238,0.626468941906,0.105627919134,7
+0,1013.67571419,0.042315529666,0.717619310322,7
+1,994.747747892,6.01989364024,0.772910130015,7
+1,991.654593872,7.35575736952,1.19822091548,7
+0,1008.47101732,8.28240754909,0.229582481359,7
+0,1000.81975227,1.52448354056,0.096441660362,7
+0,10.0900922344,322.656649307,57.8149073088,8
+1,10.0868337371,2.88652339174,54.8865514572,8
+0,10.0988984137,979.483832657,52.6809830901,8
+0,9.97678959238,665.770979738,481.069628909,8
+0,9.78554312773,257.309358658,47.7324475232,8
+0,10.0985967566,935.896512941,138.937052808,8
+0,10.0522252319,876.376299607,6.00373510669,8
+1,9.88065229501,9.99979825653,0.0674603696149,9
+0,10.0483244098,0.0653852316381,0.130679349938,9
+1,9.99685215607,1.76602542774,0.2551321159,9
+0,9.99750159428,1.01591534436,0.145445506504,9
+1,9.97380908941,0.940048645571,0.411805696316,9
+0,9.99977678382,6.91329929641,5.57858201258,9
+0,978.876096381,933.775364741,0.579170824236,10
+0,998.381016406,220.940470582,2.01491778565,10
+0,987.917644594,8.74667873567,0.364006099758,10
+0,1000.20994892,25.2945450565,3.5684398964,10
+0,1014.57141264,675.593540733,0.164174055535,10
+0,998.867283535,765.452750642,0.818425293238,10
+0,10.2143092481,273.576539531,137.111774354,11
+0,10.0366658918,842.469052609,2.32134375927,11
+0,10.1281202091,395.654057342,35.4184893063,11
+0,10.1443721289,960.058461049,272.887070637,11
+0,10.1353234784,535.51304462,2.15393842032,11
+1,10.0451640374,216.733858424,55.6533298016,11
+1,9.94254592171,44.5985537358,304.614176871,11
+0,10.1319257181,613.545504487,5.42391587912,11
+0,1020.63622468,997.476744201,0.509425590461,12
+0,986.304585519,822.669937965,0.605133561808,12
+1,1012.66863221,26.7185759069,0.0875458784828,12
+0,995.387656321,81.8540176995,0.691999430068,12
+0,1020.6587198,848.826964547,0.540159430526,12
+1,1003.81573853,379.84350931,0.0083682925194,12
+0,1021.60921516,641.376951467,1.12339054807,12
+0,1000.17585041,122.107138713,1.09906375372,12
+1,987.64802348,5.98448541152,0.124241987204,12
+1,9.94610136583,346.114985897,0.387708236565,13
+0,9.96812192337,313.278109696,0.00863026595671,13
+0,10.0181739194,36.7378924562,2.92179879835,13
+0,9.89000102695,164.273723971,0.685222591968,13
+0,10.1555212436,320.451459462,2.01341536261,13
+0,10.0085727613,999.767117646,0.462294934168,13
+1,9.93099658724,5.17478203909,0.213855205032,13
+0,10.0629454957,663.088181857,0.049022351462,13
+0,10.1109732417,734.904569784,1.6998450094,13
+0,1006.6015266,505.023453703,1.90870566777,14
+0,991.865769489,245.437343115,0.475109744256,14
+0,998.682734072,950.041057232,1.9256314201,14
+0,1005.02207209,2.9619314197,0.0517146822357,14
+0,1002.54526214,860.562681899,0.915687092848,14
+0,1000.38847359,808.416525088,0.209690673808,14
+1,992.557818382,373.889409453,0.107571728577,14
+0,1002.07722137,997.329626371,1.06504260496,14
+0,1000.40504333,949.832139189,0.539159980327,14
+0,10.1460179902,8.86082969819,135.953842715,15
+1,9.98529296553,2.87366448495,1.74249892194,15
+0,9.88942676744,9.4031821056,149.473066381,15
+1,10.0192953341,1.99685737576,1.79502473397,15
+0,10.0110654379,8.13112593726,87.7765628103,15
+0,997.148677047,733.936190093,1.49298494242,16
+0,1008.70465919,957.121652078,0.217414013634,16
+1,997.356154278,541.599587807,0.100855972216,16
+0,999.615897283,943.700501824,0.862874175879,16
+1,997.36859077,0.200859940848,0.13601892182,16
+0,10.0423255624,1.73855202168,0.956695338485,17
+1,9.88440755486,9.9994600678,0.305080529665,17
+0,10.0891026412,3.28031719474,0.364450973697,17
+0,9.90078644258,8.77839663617,0.456660574479,17
+1,9.79380029711,8.77220326156,0.527292005175,17
+0,9.93613887011,9.76270841268,1.40865693823,17
+0,10.0009239007,7.29056178263,0.498015866607,17
+0,9.96603319905,5.12498000925,0.517492532783,17
+0,10.0923827222,2.76652583955,1.56571226159,17
+1,10.0983782035,587.788120694,0.031756483687,18
+1,9.91397225464,994.527496819,3.72092164978,18
+0,10.1057472738,2.92894440088,0.683506438532,18
+0,10.1014053354,959.082038017,1.07039624129,18
+0,10.1433253044,322.515119317,0.51408278993,18
+1,9.82832510699,637.104433908,0.250272776427,18
+0,1000.49729075,2.75336888111,0.576634423274,19
+1,984.90338088,0.0295435794035,1.26273339929,19
+0,1001.53811442,4.64164410861,0.0293389959504,19
+1,995.875898395,5.08223403205,0.382330566779,19
+0,996.405937252,6.26395190757,0.453645816611,19
+0,10.0165140779,340.126072514,0.220794603312,20
+0,9.93482824816,951.672000448,0.124406293612,20
+0,10.1700278554,0.0140985961008,0.252452256311,20
+0,9.99825079542,950.382643896,0.875382402062,20
+0,9.87316410028,686.788257829,0.215886999825,20
+0,10.2893240654,89.3947931451,0.569578232133,20
+0,9.98689192703,0.430107535413,2.99869831728,20
+0,10.1365175107,972.279245093,0.0865099386744,20
+0,9.90744703306,50.810461183,3.00863325197,20
diff --git a/plugin/updater_oneapi/predictor_oneapi.cc b/plugin/updater_oneapi/predictor_oneapi.cc
index 59b170b28..25a14186c 100755
--- a/plugin/updater_oneapi/predictor_oneapi.cc
+++ b/plugin/updater_oneapi/predictor_oneapi.cc
@@ -1,447 +1,447 @@
-/*!
- * Copyright by Contributors 2017-2020
- */
-#include <any>  // for any
-#include <cstddef>
-#include <limits>
-#include <mutex>
-
-#include "../../src/common/math.h"
-#include "../../src/data/adapter.h"
-#include "../../src/gbm/gbtree_model.h"
-#include "CL/sycl.hpp"
-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/logging.h"
-#include "xgboost/predictor.h"
-#include "xgboost/tree_model.h"
-#include "xgboost/tree_updater.h"
-
-namespace xgboost {
-namespace predictor {
-
-DMLC_REGISTRY_FILE_TAG(predictor_oneapi);
-
-/*! \brief Element from a sparse vector */
-struct EntryOneAPI {
-  /*! \brief feature index */
-  bst_feature_t index;
-  /*! \brief feature value */
-  bst_float fvalue;
-  /*! \brief default constructor */
-  EntryOneAPI() = default;
-  /*!
-   * \brief constructor with index and value
-   * \param index The feature or row index.
-   * \param fvalue The feature value.
-   */
-  EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {}
-
-  EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {}
-
-  /*! \brief reversely compare feature values */
-  inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) {
-    return a.fvalue < b.fvalue;
-  }
-  inline bool operator==(const EntryOneAPI& other) const {
-    return (this->index == other.index && this->fvalue == other.fvalue);
-  }
-};
-
-struct DeviceMatrixOneAPI {
-  DMatrix* p_mat;  // Pointer to the original matrix on the host
-  cl::sycl::queue qu_;
-  size_t* row_ptr;
-  size_t row_ptr_size;
-  EntryOneAPI* data;
-
-  DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) {
-    size_t num_row = 0;
-    size_t num_nonzero = 0;
-    for (auto &batch : dmat->GetBatches<SparsePage>()) {
-      const auto& data_vec = batch.data.HostVector();
-      const auto& offset_vec = batch.offset.HostVector();
-      num_nonzero += data_vec.size();
-      num_row += batch.Size();
-    }
-
-    row_ptr = cl::sycl::malloc_shared<size_t>(num_row + 1, qu_);
-    data = cl::sycl::malloc_shared<EntryOneAPI>(num_nonzero, qu_);
-
-    size_t data_offset = 0;
-    for (auto &batch : dmat->GetBatches<SparsePage>()) {
-      const auto& data_vec = batch.data.HostVector();
-      const auto& offset_vec = batch.offset.HostVector();
-      size_t batch_size = batch.Size();
-      if (batch_size > 0) {
-        std::copy(offset_vec.data(), offset_vec.data() + batch_size,
-                  row_ptr + batch.base_rowid);
-        if (batch.base_rowid > 0) {
-          for(size_t i = 0; i < batch_size; i++)
-            row_ptr[i + batch.base_rowid] += batch.base_rowid;
-        }
-        std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size],
-                  data + data_offset);
-        data_offset += offset_vec[batch_size];
-      }
-    }
-    row_ptr[num_row] = data_offset;
-    row_ptr_size = num_row + 1;
-  }
-
-  ~DeviceMatrixOneAPI() {
-    if (row_ptr) {
-      cl::sycl::free(row_ptr, qu_);
-    }
-    if (data) {
-      cl::sycl::free(data, qu_);
-    }
-  }
-};
-
-struct DeviceNodeOneAPI {
-  DeviceNodeOneAPI()
-      : fidx(-1), left_child_idx(-1), right_child_idx(-1) {}
-
-  union NodeValue {
-    float leaf_weight;
-    float fvalue;
-  };
-
-  int fidx;
-  int left_child_idx;
-  int right_child_idx;
-  NodeValue val;
-
-  DeviceNodeOneAPI(const RegTree::Node& n) {  // NOLINT
-    this->left_child_idx = n.LeftChild();
-    this->right_child_idx = n.RightChild();
-    this->fidx = n.SplitIndex();
-    if (n.DefaultLeft()) {
-      fidx |= (1U << 31);
-    }
-
-    if (n.IsLeaf()) {
-      this->val.leaf_weight = n.LeafValue();
-    } else {
-      this->val.fvalue = n.SplitCond();
-    }
-  }
-
-  bool IsLeaf() const { return left_child_idx == -1; }
-
-  int GetFidx() const { return fidx & ((1U << 31) - 1U); }
-
-  bool MissingLeft() const { return (fidx >> 31) != 0; }
-
-  int MissingIdx() const {
-    if (MissingLeft()) {
-      return this->left_child_idx;
-    } else {
-      return this->right_child_idx;
-    }
-  }
-
-  float GetFvalue() const { return val.fvalue; }
-
-  float GetWeight() const { return val.leaf_weight; }
-};
-
-class DeviceModelOneAPI {
- public:
-  cl::sycl::queue qu_;
-  DeviceNodeOneAPI* nodes;
-  size_t* tree_segments;
-  int* tree_group;
-  size_t tree_beg_;
-  size_t tree_end_;
-  int num_group;
-
-  DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {}
-
-  ~DeviceModelOneAPI() {
-    Reset();
-  }
-
-  void Reset() {
-    if (nodes)
-      cl::sycl::free(nodes, qu_);
-    if (tree_segments)
-      cl::sycl::free(tree_segments, qu_);
-    if (tree_group)
-      cl::sycl::free(tree_group, qu_);
-  }
-
-  void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) {
-    qu_ = qu;
-    CHECK_EQ(model.param.size_leaf_vector, 0);
-    Reset();
-
-    tree_segments = cl::sycl::malloc_shared<size_t>((tree_end - tree_begin) + 1, qu_);
-    int sum = 0;
-    tree_segments[0] = sum;
-    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      sum += model.trees[tree_idx]->GetNodes().size();
-      tree_segments[tree_idx - tree_begin + 1] = sum;
-    }
-
-    nodes = cl::sycl::malloc_shared<DeviceNodeOneAPI>(sum, qu_);
-    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      auto& src_nodes = model.trees[tree_idx]->GetNodes();
-      for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++)
-        nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx];
-    }
-
-    tree_group = cl::sycl::malloc_shared<int>(model.tree_info.size(), qu_);
-    for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++)
-      tree_group[tree_idx] = model.tree_info[tree_idx];
-
-    tree_beg_ = tree_begin;
-    tree_end_ = tree_end;
-    num_group = model.learner_model_param->num_output_group;
-  }
-};
-
-float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) {
-  // Binary search
-  auto begin_ptr = data + row_ptr[ridx];
-  auto end_ptr = data + row_ptr[ridx + 1];
-  EntryOneAPI* previous_middle = nullptr;
-  while (end_ptr != begin_ptr) {
-    auto middle = begin_ptr + (end_ptr - begin_ptr) / 2;
-    if (middle == previous_middle) {
-      break;
-    } else {
-      previous_middle = middle;
-    }
-
-    if (middle->index == fidx) {
-      is_missing = false;
-      return middle->fvalue;
-    } else if (middle->index < fidx) {
-      begin_ptr = middle;
-    } else {
-      end_ptr = middle;
-    }
-  }
-  is_missing = true;
-  return 0.0;
-}
-
-float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) {
-  DeviceNodeOneAPI n = tree[0];
-  int node_id = 0;
-  bool is_missing;
-  while (!n.IsLeaf()) {
-    float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing);
-    // Missing value
-    if (is_missing) {
-      n = tree[n.MissingIdx()];
-    } else {
-      if (fvalue < n.GetFvalue()) {
-        node_id = n.left_child_idx;
-        n = tree[n.left_child_idx];
-      } else {
-        node_id = n.right_child_idx;
-        n = tree[n.right_child_idx];
-      }
-    }
-  }
-  return n.GetWeight();
-}
-
-class PredictorOneAPI : public Predictor {
- protected:
-  void InitOutPredictions(const MetaInfo& info,
-                          HostDeviceVector<bst_float>* out_preds,
-                          const gbm::GBTreeModel& model) const {
-    CHECK_NE(model.learner_model_param->num_output_group, 0);
-    size_t n = model.learner_model_param->num_output_group * info.num_row_;
-    const auto& base_margin = info.base_margin_.HostVector();
-    out_preds->Resize(n);
-    std::vector<bst_float>& out_preds_h = out_preds->HostVector();
-    if (base_margin.size() == n) {
-      CHECK_EQ(out_preds->Size(), n);
-      std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
-    } else {
-      if (!base_margin.empty()) {
-        std::ostringstream oss;
-        oss << "Ignoring the base margin, since it has incorrect length. "
-            << "The base margin must be an array of length ";
-        if (model.learner_model_param->num_output_group > 1) {
-          oss << "[num_class] * [number of data points], i.e. "
-              << model.learner_model_param->num_output_group << " * " << info.num_row_
-              << " = " << n << ". ";
-        } else {
-          oss << "[number of data points], i.e. " << info.num_row_ << ". ";
-        }
-        oss << "Instead, all data points will use "
-            << "base_score = " << model.learner_model_param->base_score;
-        LOG(WARNING) << oss.str();
-      }
-      std::fill(out_preds_h.begin(), out_preds_h.end(),
-                model.learner_model_param->base_score);
-    }
-  }
-
-  void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector<float>* out_preds,
-                             const gbm::GBTreeModel& model, size_t tree_begin,
-                             size_t tree_end) {
-    if (tree_end - tree_begin == 0) {
-      return;
-    }
-    model_.Init(model, tree_begin, tree_end, qu_);
-
-    auto& out_preds_vec = out_preds->HostVector();
-
-    DeviceNodeOneAPI* nodes = model_.nodes;
-    cl::sycl::buffer<float, 1> out_preds_buf(out_preds_vec.data(), out_preds_vec.size());
-    size_t* tree_segments = model_.tree_segments;
-    int* tree_group = model_.tree_group;
-    size_t* row_ptr = dmat->row_ptr;
-    EntryOneAPI* data = dmat->data;
-    int num_features = dmat->p_mat->Info().num_col_;
-    int num_rows = dmat->row_ptr_size - 1;
-    int num_group = model.learner_model_param->num_output_group;
-
-    qu_.submit([&](cl::sycl::handler& cgh) {
-      auto out_predictions = out_preds_buf.get_access<cl::sycl::access::mode::read_write>(cgh);
-      cgh.parallel_for<class PredictInternal>(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) {
-        int global_idx = pid[0];
-        if (global_idx >= num_rows) return;
-        if (num_group == 1) {
-          float sum = 0.0;
-          for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-            const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin];
-            sum += GetLeafWeight(global_idx, tree, data, row_ptr);
-          }
-          out_predictions[global_idx] += sum;
-        } else {
-          for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-            const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin];
-            int out_prediction_idx = global_idx * num_group + tree_group[tree_idx];
-            out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr);
-          }
-        }
-      });
-    }).wait();
-  }
-
- public:
-  explicit PredictorOneAPI(Context const* generic_param) :
-      Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) {
-    cl::sycl::default_selector selector;
-    qu_ = cl::sycl::queue(selector);
-  }
-
-  // ntree_limit is a very problematic parameter, as it's ambiguous in the context of
-  // multi-output and forest.  Same problem exists for tree_begin
-  void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
-                    const gbm::GBTreeModel& model, int tree_begin,
-                    uint32_t const ntree_limit = 0) override {
-    if (this->device_matrix_cache_.find(dmat) ==
-        this->device_matrix_cache_.end()) {
-      this->device_matrix_cache_.emplace(
-          dmat, std::unique_ptr<DeviceMatrixOneAPI>(
-                    new DeviceMatrixOneAPI(dmat, qu_)));
-    }
-    DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get();
-
-    // tree_begin is not used, right now we just enforce it to be 0.
-    CHECK_EQ(tree_begin, 0);
-    auto* out_preds = &predts->predictions;
-    CHECK_GE(predts->version, tree_begin);
-    if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) {
-      CHECK_EQ(predts->version, 0);
-    }
-    if (predts->version == 0) {
-      // out_preds->Size() can be non-zero as it's initialized here before any tree is
-      // built at the 0^th iterator.
-      this->InitOutPredictions(dmat->Info(), out_preds, model);
-    }
-
-    uint32_t const output_groups = model.learner_model_param->num_output_group;
-    CHECK_NE(output_groups, 0);
-    // Right now we just assume ntree_limit provided by users means number of tree layers
-    // in the context of multi-output model
-    uint32_t real_ntree_limit = ntree_limit * output_groups;
-    if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
-      real_ntree_limit = static_cast<uint32_t>(model.trees.size());
-    }
-
-    uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups;
-    // When users have provided ntree_limit, end_version can be lesser, cache is violated
-    if (predts->version > end_version) {
-      CHECK_NE(ntree_limit, 0);
-      this->InitOutPredictions(dmat->Info(), out_preds, model);
-      predts->version = 0;
-    }
-    uint32_t const beg_version = predts->version;
-    CHECK_LE(beg_version, end_version);
-
-    if (beg_version < end_version) {
-      DevicePredictInternal(device_matrix, out_preds, model,
-                            beg_version * output_groups,
-                            end_version * output_groups);
-    }
-
-    // delta means {size of forest} * {number of newly accumulated layers}
-    uint32_t delta = end_version - beg_version;
-    CHECK_LE(delta, model.trees.size());
-    predts->Update(delta);
-
-    CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ ||
-          out_preds->Size() == dmat->Info().num_row_);
-  }
-
-  void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing,
-                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
-                      unsigned tree_end) const override {
-    cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end);
-  }
-
-  void PredictInstance(const SparsePage::Inst& inst,
-                       std::vector<bst_float>* out_preds,
-                       const gbm::GBTreeModel& model, unsigned ntree_limit) override {
-    cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit);
-  }
-
-  void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
-                   const gbm::GBTreeModel& model, unsigned ntree_limit) override {
-    cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit);
-  }
-
-  void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
-                           const gbm::GBTreeModel& model, uint32_t ntree_limit,
-                           std::vector<bst_float>* tree_weights,
-                           bool approximate, int condition,
-                           unsigned condition_feature) override {
-    cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature);
-  }
-
-  void PredictInteractionContributions(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
-                                       const gbm::GBTreeModel& model, unsigned ntree_limit,
-                                       std::vector<bst_float>* tree_weights,
-                                       bool approximate) override {
-    cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate);
-  }
-
- private:
-  cl::sycl::queue qu_;
-  DeviceModelOneAPI model_;
-
-  std::mutex lock_;
-  std::unique_ptr<Predictor> cpu_predictor;
-
-  std::unordered_map<DMatrix*, std::unique_ptr<DeviceMatrixOneAPI>>
-      device_matrix_cache_;
-};
-
-XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor")
-.describe("Make predictions using DPC++.")
-.set_body([](Context const* generic_param) {
-            return new PredictorOneAPI(generic_param);
-          });
-}  // namespace predictor
-}  // namespace xgboost
+/*!
+ * Copyright by Contributors 2017-2020
+ */
+#include <any>  // for any
+#include <cstddef>
+#include <limits>
+#include <mutex>
+
+#include "../../src/common/math.h"
+#include "../../src/data/adapter.h"
+#include "../../src/gbm/gbtree_model.h"
+#include "CL/sycl.hpp"
+#include "xgboost/base.h"
+#include "xgboost/data.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/logging.h"
+#include "xgboost/predictor.h"
+#include "xgboost/tree_model.h"
+#include "xgboost/tree_updater.h"
+
+namespace xgboost {
+namespace predictor {
+
+DMLC_REGISTRY_FILE_TAG(predictor_oneapi);
+
+/*! \brief Element from a sparse vector: a (feature index, feature value) pair */
+struct EntryOneAPI {
+  /*! \brief feature index */
+  bst_feature_t index;
+  /*! \brief feature value */
+  bst_float fvalue;
+  /*! \brief default constructor */
+  EntryOneAPI() = default;
+  /*!
+   * \brief constructor with index and value
+   * \param index The feature or row index.
+   * \param fvalue The feature value.
+   */
+  EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {}
+
+  EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {}  // implicit conversion from Entry: copies index and fvalue
+
+  /*! \brief ascending comparison on feature values: true when a.fvalue < b.fvalue */
+  inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) {
+    return a.fvalue < b.fvalue;
+  }
+  inline bool operator==(const EntryOneAPI& other) const {  // equal only when both index and fvalue match
+    return (this->index == other.index && this->fvalue == other.fvalue);
+  }
+};
+
+/*! \brief CSR copy of a DMatrix, stored in buffers from cl::sycl::malloc_shared. */
+struct DeviceMatrixOneAPI {
+  DMatrix* p_mat;  // Pointer to the original matrix on the host
+  cl::sycl::queue qu_;  // Queue used to allocate and free row_ptr and data
+  size_t* row_ptr;      // Row offsets into data; row i is [row_ptr[i], row_ptr[i + 1])
+  size_t row_ptr_size;  // Number of rows + 1
+  EntryOneAPI* data;    // Entries of all batches, concatenated in iteration order
+
+  DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) {
+    size_t num_row = 0;
+    size_t num_nonzero = 0;
+    for (auto &batch : dmat->GetBatches<SparsePage>()) {  // first pass: size the buffers
+      const auto& data_vec = batch.data.HostVector();
+      num_nonzero += data_vec.size();
+      num_row += batch.Size();
+    }
+
+    row_ptr = cl::sycl::malloc_shared<size_t>(num_row + 1, qu_);
+    data = cl::sycl::malloc_shared<EntryOneAPI>(num_nonzero, qu_);
+
+    size_t data_offset = 0;  // entries copied so far == where the next batch starts in data
+    for (auto &batch : dmat->GetBatches<SparsePage>()) {
+      const auto& data_vec = batch.data.HostVector();
+      const auto& offset_vec = batch.offset.HostVector();
+      size_t batch_size = batch.Size();
+      if (batch_size > 0) {
+        std::copy(offset_vec.data(), offset_vec.data() + batch_size,
+                  row_ptr + batch.base_rowid);
+        if (data_offset > 0) {  // page offsets start at 0: rebase them onto the shared data array
+          for (size_t i = 0; i < batch_size; i++)
+            row_ptr[i + batch.base_rowid] += data_offset;
+        }
+        std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size],
+                  data + data_offset);
+        data_offset += offset_vec[batch_size];
+      }
+    }
+    row_ptr[num_row] = data_offset;
+    row_ptr_size = num_row + 1;
+  }
+
+  ~DeviceMatrixOneAPI() {
+    if (row_ptr) {
+      cl::sycl::free(row_ptr, qu_);
+    }
+    if (data) {
+      cl::sycl::free(data, qu_);
+    }
+  }
+};
+
+struct DeviceNodeOneAPI {  // Compact tree node; filled from a RegTree::Node by the converting constructor
+  DeviceNodeOneAPI()
+      : fidx(-1), left_child_idx(-1), right_child_idx(-1) {}
+
+  union NodeValue {
+    float leaf_weight;  // written when the source node is a leaf
+    float fvalue;       // split condition, written otherwise
+  };
+
+  int fidx;             // split feature index; bit 31 is set when the default direction is left
+  int left_child_idx;   // IsLeaf() tests this against -1
+  int right_child_idx;
+  NodeValue val;
+
+  DeviceNodeOneAPI(const RegTree::Node& n) {  // NOLINT
+    this->left_child_idx = n.LeftChild();
+    this->right_child_idx = n.RightChild();
+    this->fidx = n.SplitIndex();
+    if (n.DefaultLeft()) {
+      fidx |= (1U << 31);  // pack the default-left flag into the top bit of fidx
+    }
+
+    if (n.IsLeaf()) {
+      this->val.leaf_weight = n.LeafValue();
+    } else {
+      this->val.fvalue = n.SplitCond();
+    }
+  }
+
+  bool IsLeaf() const { return left_child_idx == -1; }
+
+  int GetFidx() const { return fidx & ((1U << 31) - 1U); }  // feature index with the flag bit masked off
+
+  bool MissingLeft() const { return (fidx >> 31) != 0; }  // true when the top (flag) bit is set
+
+  int MissingIdx() const {  // child to visit when the split feature is missing
+    if (MissingLeft()) {
+      return this->left_child_idx;
+    } else {
+      return this->right_child_idx;
+    }
+  }
+
+  float GetFvalue() const { return val.fvalue; }  // meaningful for split nodes only
+
+  float GetWeight() const { return val.leaf_weight; }  // meaningful for leaves only
+};
+
+/*! \brief Flattened copy of a range of trees; owns the malloc_shared buffers below. */
+class DeviceModelOneAPI {
+ public:
+  cl::sycl::queue qu_;      // queue the current buffers were allocated with
+  DeviceNodeOneAPI* nodes;  // nodes of trees [tree_beg_, tree_end_), stored tree after tree
+  size_t* tree_segments;    // tree_segments[i]: index in nodes of the root of tree tree_beg_ + i
+  int* tree_group;          // model.tree_info copied as-is, indexed by absolute tree index
+  size_t tree_beg_;
+  size_t tree_end_;
+  int num_group;
+
+  DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {}
+
+  ~DeviceModelOneAPI() { Reset(); }
+
+  /*! \brief Free every buffer and null the pointers, so calling it again is harmless. */
+  void Reset() {
+    if (nodes) cl::sycl::free(nodes, qu_);
+    if (tree_segments) cl::sycl::free(tree_segments, qu_);
+    if (tree_group) cl::sycl::free(tree_group, qu_);
+    nodes = nullptr;
+    tree_segments = nullptr;
+    tree_group = nullptr;
+  }
+
+  void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) {
+    CHECK_EQ(model.param.size_leaf_vector, 0);
+    Reset();   // must run before qu_ changes: old buffers are freed with the queue that made them
+    qu_ = qu;
+
+    tree_segments = cl::sycl::malloc_shared<size_t>((tree_end - tree_begin) + 1, qu_);
+    size_t sum = 0;  // running node count == offset of the next tree
+    tree_segments[0] = sum;
+    for (size_t tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+      sum += model.trees[tree_idx]->GetNodes().size();
+      tree_segments[tree_idx - tree_begin + 1] = sum;
+    }
+
+    nodes = cl::sycl::malloc_shared<DeviceNodeOneAPI>(sum, qu_);
+    for (size_t tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+      auto& src_nodes = model.trees[tree_idx]->GetNodes();
+      for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++)
+        nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx];
+    }
+
+    tree_group = cl::sycl::malloc_shared<int>(model.tree_info.size(), qu_);
+    for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++)
+      tree_group[tree_idx] = model.tree_info[tree_idx];
+
+    tree_beg_ = tree_begin;
+    tree_end_ = tree_end;
+    num_group = model.learner_model_param->num_output_group;
+  }
+};
+
+float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) {
+  // Binary search for fidx in row ridx; presumably entries are sorted by index — TODO confirm
+  auto begin_ptr = data + row_ptr[ridx];
+  auto end_ptr = data + row_ptr[ridx + 1];
+  EntryOneAPI* previous_middle = nullptr;
+  while (end_ptr != begin_ptr) {
+    auto middle = begin_ptr + (end_ptr - begin_ptr) / 2;
+    if (middle == previous_middle) {  // midpoint did not move: nothing left to examine
+      break;
+    } else {
+      previous_middle = middle;
+    }
+
+    if (middle->index == fidx) {
+      is_missing = false;
+      return middle->fvalue;
+    } else if (middle->index < fidx) {
+      begin_ptr = middle;  // middle stays in range, hence the previous_middle check above
+    } else {
+      end_ptr = middle;
+    }
+  }
+  is_missing = true;  // not found: report the feature as missing
+  return 0.0;
+}
+
+/*!
+ * \brief Walk one tree for row ridx and return the weight of the leaf that is reached.
+ * A missing feature follows the node's default child; otherwise go left when fvalue < split.
+ */
+float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) {
+  DeviceNodeOneAPI n = tree[0];  // start at the root
+  bool is_missing = false;       // always overwritten by GetFvalue
+  while (!n.IsLeaf()) {
+    float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing);
+    if (is_missing) {
+      // Missing value
+      n = tree[n.MissingIdx()];
+    } else if (fvalue < n.GetFvalue()) {
+      n = tree[n.left_child_idx];
+    } else {
+      n = tree[n.right_child_idx];
+    }
+  }
+  // n is a leaf here, so val holds the leaf weight
+  return n.GetWeight();
+}
+
+class PredictorOneAPI : public Predictor {
+ protected:
+  void InitOutPredictions(const MetaInfo& info,
+                          HostDeviceVector<bst_float>* out_preds,
+                          const gbm::GBTreeModel& model) const {
+    CHECK_NE(model.learner_model_param->num_output_group, 0);
+    size_t n = model.learner_model_param->num_output_group * info.num_row_;  // one slot per (row, group)
+    const auto& base_margin = info.base_margin_.HostVector();
+    out_preds->Resize(n);
+    std::vector<bst_float>& out_preds_h = out_preds->HostVector();
+    if (base_margin.size() == n) {  // a margin of the right length seeds the predictions
+      CHECK_EQ(out_preds->Size(), n);
+      std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
+    } else {
+      if (!base_margin.empty()) {  // wrong length: warn, then fall back to base_score
+        std::ostringstream oss;
+        oss << "Ignoring the base margin, since it has incorrect length. "
+            << "The base margin must be an array of length ";
+        if (model.learner_model_param->num_output_group > 1) {
+          oss << "[num_class] * [number of data points], i.e. "
+              << model.learner_model_param->num_output_group << " * " << info.num_row_
+              << " = " << n << ". ";
+        } else {
+          oss << "[number of data points], i.e. " << info.num_row_ << ". ";
+        }
+        oss << "Instead, all data points will use "
+            << "base_score = " << model.learner_model_param->base_score;
+        LOG(WARNING) << oss.str();
+      }
+      std::fill(out_preds_h.begin(), out_preds_h.end(),
+                model.learner_model_param->base_score);
+    }
+  }
+
+  void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector<float>* out_preds,
+                             const gbm::GBTreeModel& model, size_t tree_begin,
+                             size_t tree_end) {
+    if (tree_end - tree_begin == 0) {  // no trees to evaluate: leave out_preds untouched
+      return;
+    }
+    model_.Init(model, tree_begin, tree_end, qu_);  // rebuilds the flattened trees on every call
+
+    auto& out_preds_vec = out_preds->HostVector();
+
+    DeviceNodeOneAPI* nodes = model_.nodes;
+    cl::sycl::buffer<float, 1> out_preds_buf(out_preds_vec.data(), out_preds_vec.size());  // wraps the host vector
+    size_t* tree_segments = model_.tree_segments;
+    int* tree_group = model_.tree_group;
+    size_t* row_ptr = dmat->row_ptr;
+    EntryOneAPI* data = dmat->data;
+    int num_features = dmat->p_mat->Info().num_col_;  // NOTE(review): not used below
+    int num_rows = dmat->row_ptr_size - 1;
+    int num_group = model.learner_model_param->num_output_group;
+
+    qu_.submit([&](cl::sycl::handler& cgh) {
+      auto out_predictions = out_preds_buf.get_access<cl::sycl::access::mode::read_write>(cgh);
+      cgh.parallel_for<class PredictInternal>(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) {
+        int global_idx = pid[0];  // one work-item per row
+        if (global_idx >= num_rows) return;
+        if (num_group == 1) {  // single output: sum all trees into one value per row
+          float sum = 0.0;
+          for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+            const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin];
+            sum += GetLeafWeight(global_idx, tree, data, row_ptr);
+          }
+          out_predictions[global_idx] += sum;
+        } else {  // several outputs: tree_group picks the slot each tree adds to
+          for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+            const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin];
+            int out_prediction_idx = global_idx * num_group + tree_group[tree_idx];
+            out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr);
+          }
+        }
+      });
+    }).wait();  // block until the kernel has finished
+  }
+
+ public:
+  explicit PredictorOneAPI(Context const* generic_param) :
+      Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) {
+    cl::sycl::default_selector selector;  // device choice is left to the SYCL default selector
+    qu_ = cl::sycl::queue(selector);
+  }
+
+  // ntree_limit is a very problematic parameter, as it's ambiguous in the context of
+  // multi-output and forest.  The same problem exists for tree_begin.
+  void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
+                    const gbm::GBTreeModel& model, int tree_begin,
+                    uint32_t const ntree_limit = 0) override {
+    if (this->device_matrix_cache_.find(dmat) ==
+        this->device_matrix_cache_.end()) {
+      this->device_matrix_cache_.emplace(
+          dmat, std::unique_ptr<DeviceMatrixOneAPI>(
+                    new DeviceMatrixOneAPI(dmat, qu_)));
+    }
+    DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get();  // cached copy, keyed by the DMatrix pointer
+
+    // tree_begin is not used; for now it is simply required to be 0.
+    CHECK_EQ(tree_begin, 0);
+    auto* out_preds = &predts->predictions;
+    CHECK_GE(predts->version, tree_begin);
+    if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) {
+      CHECK_EQ(predts->version, 0);
+    }
+    if (predts->version == 0) {
+      // out_preds->Size() can be non-zero as it's initialized here before any tree is
+      // built at the 0^th iteration.
+      this->InitOutPredictions(dmat->Info(), out_preds, model);
+    }
+
+    uint32_t const output_groups = model.learner_model_param->num_output_group;
+    CHECK_NE(output_groups, 0);
+    // Right now we just assume ntree_limit provided by users means number of tree layers
+    // in the context of multi-output model
+    uint32_t real_ntree_limit = ntree_limit * output_groups;
+    if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
+      real_ntree_limit = static_cast<uint32_t>(model.trees.size());
+    }
+
+    uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups;
+    // With a user-provided ntree_limit, end_version can be behind the cached version: start over
+    if (predts->version > end_version) {
+      CHECK_NE(ntree_limit, 0);
+      this->InitOutPredictions(dmat->Info(), out_preds, model);
+      predts->version = 0;
+    }
+    uint32_t const beg_version = predts->version;
+    CHECK_LE(beg_version, end_version);
+
+    if (beg_version < end_version) {
+      DevicePredictInternal(device_matrix, out_preds, model,
+                            beg_version * output_groups,
+                            end_version * output_groups);
+    }
+
+    // delta is the number of newly accumulated layers (end_version - beg_version)
+    uint32_t delta = end_version - beg_version;
+    CHECK_LE(delta, model.trees.size());
+    predts->Update(delta);
+
+    CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ ||
+          out_preds->Size() == dmat->Info().num_row_);
+  }
+
+  void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing,
+                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
+                      unsigned tree_end) const override {
+    cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end);  // forwarded unchanged to the CPU predictor
+  }
+
+  void PredictInstance(const SparsePage::Inst& inst,
+                       std::vector<bst_float>* out_preds,
+                       const gbm::GBTreeModel& model, unsigned ntree_limit) override {
+    cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit);  // forwarded unchanged to the CPU predictor
+  }
+
+  void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
+                   const gbm::GBTreeModel& model, unsigned ntree_limit) override {
+    cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit);  // forwarded unchanged to the CPU predictor
+  }
+
+  void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
+                           const gbm::GBTreeModel& model, uint32_t ntree_limit,
+                           std::vector<bst_float>* tree_weights,
+                           bool approximate, int condition,
+                           unsigned condition_feature) override {
+    cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature);  // forwarded unchanged to the CPU predictor
+  }
+
+  void PredictInteractionContributions(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
+                                       const gbm::GBTreeModel& model, unsigned ntree_limit,
+                                       std::vector<bst_float>* tree_weights,
+                                       bool approximate) override {
+    cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate);  // forwarded unchanged to the CPU predictor
+  }
+
+ private:
+  cl::sycl::queue qu_;
+  DeviceModelOneAPI model_;
+
+  std::mutex lock_;
+  std::unique_ptr<Predictor> cpu_predictor;
+
+  std::unordered_map<DMatrix*, std::unique_ptr<DeviceMatrixOneAPI>>
+      device_matrix_cache_;
+};
+
+XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor")  // registered under the name "oneapi_predictor"
+.describe("Make predictions using DPC++.")
+.set_body([](Context const* generic_param) {
+            return new PredictorOneAPI(generic_param);  // raw pointer handed to the registry's caller
+          });
+}  // namespace predictor
+}  // namespace xgboost
diff --git a/plugin/updater_oneapi/regression_loss_oneapi.h b/plugin/updater_oneapi/regression_loss_oneapi.h
index 4759f5c3f..b0299ff7f 100755
--- a/plugin/updater_oneapi/regression_loss_oneapi.h
+++ b/plugin/updater_oneapi/regression_loss_oneapi.h
@@ -1,145 +1,145 @@
-/*!
- * Copyright 2017-2020 XGBoost contributors
- */
-#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
-#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
-
-#include <dmlc/omp.h>
-#include <xgboost/logging.h>
-#include <algorithm>
-
-#include "CL/sycl.hpp"
-
-namespace xgboost {
-namespace obj {
-
-/*!
- * \brief calculate the sigmoid of the input.
- * \param x input parameter
- * \return the transformed value.
- */
-inline float SigmoidOneAPI(float x) {
-  return 1.0f / (1.0f + cl::sycl::exp(-x));
-}
-
-// common regressions
-// linear regression
-struct LinearSquareLossOneAPI {
-  static bst_float PredTransform(bst_float x) { return x; }
-  static bool CheckLabel(bst_float x) { return true; }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    return predt - label;
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    return 1.0f;
-  }
-  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
-  static const char* LabelErrorMsg() { return ""; }
-  static const char* DefaultEvalMetric() { return "rmse"; }
-
-  static const char* Name() { return "reg:squarederror_oneapi"; }
-};
-
-// TODO: DPC++ does not fully support std math inside offloaded kernels
-struct SquaredLogErrorOneAPI {
-  static bst_float PredTransform(bst_float x) { return x; }
-  static bool CheckLabel(bst_float label) {
-    return label > -1;
-  }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    predt = std::max(predt, (bst_float)(-1 + 1e-6));  // ensure correct value for log1p
-    return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1);
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    predt = std::max(predt, (bst_float)(-1 + 1e-6));
-    float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) /
-                cl::sycl::pow(predt + 1, (bst_float)2);
-    res = std::max(res, (bst_float)1e-6f);
-    return res;
-  }
-  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
-  static const char* LabelErrorMsg() {
-    return "label must be greater than -1 for rmsle so that log(label + 1) can be valid.";
-  }
-  static const char* DefaultEvalMetric() { return "rmsle"; }
-
-  static const char* Name() { return "reg:squaredlogerror_oneapi"; }
-};
-
-// logistic loss for probability regression task
-struct LogisticRegressionOneAPI {
-  // duplication is necessary, as __device__ specifier
-  // cannot be made conditional on template parameter
-  static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); }
-  static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    return predt - label;
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    const bst_float eps = 1e-16f;
-    return std::max(predt * (1.0f - predt), eps);
-  }
-  template <typename T>
-  static T PredTransform(T x) { return SigmoidOneAPI(x); }
-  template <typename T>
-  static T FirstOrderGradient(T predt, T label) { return predt - label; }
-  template <typename T>
-  static T SecondOrderGradient(T predt, T label) {
-    const T eps = T(1e-16f);
-    return std::max(predt * (T(1.0f) - predt), eps);
-  }
-  static bst_float ProbToMargin(bst_float base_score) {
-    CHECK(base_score > 0.0f && base_score < 1.0f)
-        << "base_score must be in (0,1) for logistic loss, got: " << base_score;
-    return -logf(1.0f / base_score - 1.0f);
-  }
-  static const char* LabelErrorMsg() {
-    return "label must be in [0,1] for logistic regression";
-  }
-  static const char* DefaultEvalMetric() { return "rmse"; }
-
-  static const char* Name() { return "reg:logistic_oneapi"; }
-};
-
-// logistic loss for binary classification task
-struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI {
-  static const char* DefaultEvalMetric() { return "logloss"; }
-  static const char* Name() { return "binary:logistic_oneapi"; }
-};
-
-// logistic loss, but predict un-transformed margin
-struct LogisticRawOneAPI : public LogisticRegressionOneAPI {
-  // duplication is necessary, as __device__ specifier
-  // cannot be made conditional on template parameter
-  static bst_float PredTransform(bst_float x) { return x; }
-  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
-    predt = SigmoidOneAPI(predt);
-    return predt - label;
-  }
-  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
-    const bst_float eps = 1e-16f;
-    predt = SigmoidOneAPI(predt);
-    return std::max(predt * (1.0f - predt), eps);
-  }
-  template <typename T>
-    static T PredTransform(T x) { return x; }
-  template <typename T>
-    static T FirstOrderGradient(T predt, T label) {
-    predt = SigmoidOneAPI(predt);
-    return predt - label;
-  }
-  template <typename T>
-    static T SecondOrderGradient(T predt, T label) {
-    const T eps = T(1e-16f);
-    predt = SigmoidOneAPI(predt);
-    return std::max(predt * (T(1.0f) - predt), eps);
-  }
-  static const char* DefaultEvalMetric() { return "logloss"; }
-
-  static const char* Name() { return "binary:logitraw_oneapi"; }
-};
-
-}  // namespace obj
-}  // namespace xgboost
-
-#endif  // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
+/*!
+ * Copyright 2017-2020 XGBoost contributors
+ */
+#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
+#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
+
+#include <dmlc/omp.h>
+#include <xgboost/logging.h>
+#include <algorithm>
+
+#include "CL/sycl.hpp"
+
+namespace xgboost {
+namespace obj {
+
+/*!
+ * \brief calculate the sigmoid of the input, 1 / (1 + exp(-x)), using cl::sycl::exp.
+ * \param x input parameter
+ * \return the transformed value.
+ */
+inline float SigmoidOneAPI(float x) {
+  return 1.0f / (1.0f + cl::sycl::exp(-x));
+}
+
+// common regressions
+// linear regression: gradient is predt - label, second-order gradient is the constant 1
+struct LinearSquareLossOneAPI {
+  static bst_float PredTransform(bst_float x) { return x; }  // identity: predictions are used as-is
+  static bool CheckLabel(bst_float x) { return true; }  // every label is accepted
+  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
+    return predt - label;
+  }
+  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
+    return 1.0f;
+  }
+  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
+  static const char* LabelErrorMsg() { return ""; }  // empty because CheckLabel never fails
+  static const char* DefaultEvalMetric() { return "rmse"; }
+
+  static const char* Name() { return "reg:squarederror_oneapi"; }
+};
+
+// TODO: DPC++ does not fully support std math inside offloaded kernels
+struct SquaredLogErrorOneAPI {
+  static bst_float PredTransform(bst_float x) { return x; }
+  static bool CheckLabel(bst_float label) {
+    return label > -1;  // log1p(label) is evaluated in the gradients below
+  }
+  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
+    predt = std::max(predt, (bst_float)(-1 + 1e-6));  // clamp just above -1 so log1p(predt) stays valid
+    return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1);
+  }
+  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
+    predt = std::max(predt, (bst_float)(-1 + 1e-6));  // same clamp as in FirstOrderGradient
+    float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) /
+                cl::sycl::pow(predt + 1, (bst_float)2);
+    res = std::max(res, (bst_float)1e-6f);  // result is floored at 1e-6
+    return res;
+  }
+  static bst_float ProbToMargin(bst_float base_score) { return base_score; }
+  static const char* LabelErrorMsg() {
+    return "label must be greater than -1 for rmsle so that log(label + 1) can be valid.";
+  }
+  static const char* DefaultEvalMetric() { return "rmsle"; }
+
+  static const char* Name() { return "reg:squaredlogerror_oneapi"; }
+};
+
+// logistic loss for probability regression task
+struct LogisticRegressionOneAPI {
+  // Non-template and template overloads are both provided. NOTE(review): the inherited rationale
+  // cites the __device__ specifier, which does not appear in this file — confirm it still applies.
+  static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); }
+  static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; }
+  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
+    return predt - label;
+  }
+  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
+    const bst_float eps = 1e-16f;  // lower bound for the returned value
+    return std::max(predt * (1.0f - predt), eps);
+  }
+  template <typename T>
+  static T PredTransform(T x) { return SigmoidOneAPI(x); }
+  template <typename T>
+  static T FirstOrderGradient(T predt, T label) { return predt - label; }
+  template <typename T>
+  static T SecondOrderGradient(T predt, T label) {
+    const T eps = T(1e-16f);
+    return std::max(predt * (T(1.0f) - predt), eps);
+  }
+  static bst_float ProbToMargin(bst_float base_score) {
+    CHECK(base_score > 0.0f && base_score < 1.0f)
+        << "base_score must be in (0,1) for logistic loss, got: " << base_score;
+    return -logf(1.0f / base_score - 1.0f);  // -log(1/p - 1)
+  }
+  static const char* LabelErrorMsg() {
+    return "label must be in [0,1] for logistic regression";
+  }
+  static const char* DefaultEvalMetric() { return "rmse"; }
+
+  static const char* Name() { return "reg:logistic_oneapi"; }
+};
+
+// logistic loss for binary classification task; only the metric and the name differ from the base
+struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI {
+  static const char* DefaultEvalMetric() { return "logloss"; }
+  static const char* Name() { return "binary:logistic_oneapi"; }
+};
+
+// logistic loss, but predict un-transformed margin
+struct LogisticRawOneAPI : public LogisticRegressionOneAPI {
+  // Non-template and template overloads are both provided. NOTE(review): the inherited rationale
+  // cites the __device__ specifier, which does not appear in this file — confirm it still applies.
+  static bst_float PredTransform(bst_float x) { return x; }  // identity: the margin is returned as-is
+  static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
+    predt = SigmoidOneAPI(predt);  // the input is a margin here, so apply the sigmoid first
+    return predt - label;
+  }
+  static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
+    const bst_float eps = 1e-16f;  // lower bound for the returned value
+    predt = SigmoidOneAPI(predt);
+    return std::max(predt * (1.0f - predt), eps);
+  }
+  template <typename T>
+    static T PredTransform(T x) { return x; }
+  template <typename T>
+    static T FirstOrderGradient(T predt, T label) {
+    predt = SigmoidOneAPI(predt);
+    return predt - label;
+  }
+  template <typename T>
+    static T SecondOrderGradient(T predt, T label) {
+    const T eps = T(1e-16f);
+    predt = SigmoidOneAPI(predt);
+    return std::max(predt * (T(1.0f) - predt), eps);
+  }
+  static const char* DefaultEvalMetric() { return "logloss"; }
+
+  static const char* Name() { return "binary:logitraw_oneapi"; }
+};
+
+}  // namespace obj
+}  // namespace xgboost
+
+#endif  // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
diff --git a/plugin/updater_oneapi/regression_obj_oneapi.cc b/plugin/updater_oneapi/regression_obj_oneapi.cc
index 4a1bd7229..3ee5741e7 100755
--- a/plugin/updater_oneapi/regression_obj_oneapi.cc
+++ b/plugin/updater_oneapi/regression_obj_oneapi.cc
@@ -1,182 +1,182 @@
-#include <xgboost/logging.h>
-#include <xgboost/objective.h>
-#include <cmath>
-#include <memory>
-#include <vector>
-
-#include "xgboost/host_device_vector.h"
-#include "xgboost/json.h"
-#include "xgboost/parameter.h"
-#include "xgboost/span.h"
-
-#include "../../src/common/transform.h"
-#include "../../src/common/common.h"
-#include "./regression_loss_oneapi.h"
-
-#include "CL/sycl.hpp"
-
-namespace xgboost {
-namespace obj {
-
-DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi);
-
-struct RegLossParamOneAPI : public XGBoostParameter<RegLossParamOneAPI> {
-  float scale_pos_weight;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) {
-    DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
-      .describe("Scale the weight of positive examples by this factor");
-  }
-};
-
-template<typename Loss>
-class RegLossObjOneAPI : public ObjFunction {
- protected:
-  HostDeviceVector<int> label_correct_;
-
- public:
-  RegLossObjOneAPI() = default;
-
-  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
-    param_.UpdateAllowUnknown(args);
-
-    cl::sycl::default_selector selector;
-    qu_ = cl::sycl::queue(selector);
-  }
-
-  void GetGradient(const HostDeviceVector<bst_float>& preds,
-                   const MetaInfo &info,
-                   int iter,
-                   HostDeviceVector<GradientPair>* out_gpair) override {
-    if (info.labels_.Size() == 0U) {
-      LOG(WARNING) << "Label set is empty.";
-    }
-    CHECK_EQ(preds.Size(), info.labels_.Size())
-        << " " << "labels are not correctly provided"
-        << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", "
-        << "Loss: " << Loss::Name();
-
-    size_t const ndata = preds.Size();
-    out_gpair->Resize(ndata);
-
-    // TODO: add label_correct check
-    label_correct_.Resize(1);
-    label_correct_.Fill(1);
-
-    bool is_null_weight = info.weights_.Size() == 0;
-
-    cl::sycl::buffer<bst_float, 1> preds_buf(preds.HostPointer(), preds.Size());
-    cl::sycl::buffer<bst_float, 1> labels_buf(info.labels_.HostPointer(), info.labels_.Size());
-    cl::sycl::buffer<GradientPair, 1> out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size());
-    cl::sycl::buffer<bst_float, 1> weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(),
-                                               is_null_weight ? 1 : info.weights_.Size());
-
-	cl::sycl::buffer<int, 1> additional_input_buf(1);
-	{
-		auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::write>();
-		additional_input_acc[0] = 1; // Fill the label_correct flag
-	}
-
-    auto scale_pos_weight = param_.scale_pos_weight;
-    if (!is_null_weight) {
-      CHECK_EQ(info.weights_.Size(), ndata)
-        << "Number of weights should be equal to number of data points.";
-    }
-
-    qu_.submit([&](cl::sycl::handler& cgh) {
-      auto preds_acc            = preds_buf.get_access<cl::sycl::access::mode::read>(cgh);
-      auto labels_acc           = labels_buf.get_access<cl::sycl::access::mode::read>(cgh);
-      auto weights_acc          = weights_buf.get_access<cl::sycl::access::mode::read>(cgh);
-      auto out_gpair_acc        = out_gpair_buf.get_access<cl::sycl::access::mode::write>(cgh);
-      auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::write>(cgh);
-      cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
-        int idx = pid[0];
-        bst_float p = Loss::PredTransform(preds_acc[idx]);
-        bst_float w = is_null_weight ? 1.0f : weights_acc[idx];
-        bst_float label = labels_acc[idx];
-        if (label == 1.0f) {
-          w *= scale_pos_weight;
-        }
-        if (!Loss::CheckLabel(label)) {
-          // If there is an incorrect label, the host code will know.
-          additional_input_acc[0] = 0;
-        }
-        out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
-                                          Loss::SecondOrderGradient(p, label) * w);
-      });
-    }).wait();
-
-    int flag = 1;
-	{
-		auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::read>();
-		flag = additional_input_acc[0];
-	}
-
-    if (flag == 0) {
-      LOG(FATAL) << Loss::LabelErrorMsg();
-    }
-  
-  }
-
- public:
-  const char* DefaultEvalMetric() const override {
-    return Loss::DefaultEvalMetric();
-  }
-
-  void PredTransform(HostDeviceVector<float> *io_preds) override {
-    size_t const ndata = io_preds->Size();
-
-    cl::sycl::buffer<bst_float, 1> io_preds_buf(io_preds->HostPointer(), io_preds->Size());
-
-    qu_.submit([&](cl::sycl::handler& cgh) {
-      auto io_preds_acc = io_preds_buf.get_access<cl::sycl::access::mode::read_write>(cgh);
-      cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
-        int idx = pid[0];
-        io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]);
-      });
-    }).wait();
-  }
-
-  float ProbToMargin(float base_score) const override {
-    return Loss::ProbToMargin(base_score);
-  }
-
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["name"] = String(Loss::Name());
-    out["reg_loss_param"] = ToJson(param_);
-  }
-
-  void LoadConfig(Json const& in) override {
-    FromJson(in["reg_loss_param"], &param_);
-  }
-
- protected:
-  RegLossParamOneAPI param_;
-
-  cl::sycl::queue qu_;
-};
-
-// register the objective functions
-DMLC_REGISTER_PARAMETER(RegLossParamOneAPI);
-
-// TODO: Find a better way to dispatch names of DPC++ kernels with various template parameters of loss function
-XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name())
-.describe("Regression with squared error with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LinearSquareLossOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name())
-.describe("Regression with root mean squared logarithmic error with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<SquaredLogErrorOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name())
-.describe("Logistic regression for probability regression task with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LogisticRegressionOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name())
-.describe("Logistic regression for binary classification task with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LogisticClassificationOneAPI>(); });
-XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name())
-.describe("Logistic regression for classification, output score "
-          "before logistic transformation with DPC++ backend.")
-.set_body([]() { return new RegLossObjOneAPI<LogisticRawOneAPI>(); });
-
-}  // namespace obj
-}  // namespace xgboost
+#include <xgboost/logging.h>
+#include <xgboost/objective.h>
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "xgboost/host_device_vector.h"
+#include "xgboost/json.h"
+#include "xgboost/parameter.h"
+#include "xgboost/span.h"
+
+#include "../../src/common/transform.h"
+#include "../../src/common/common.h"
+#include "./regression_loss_oneapi.h"
+
+#include "CL/sycl.hpp"
+
+namespace xgboost {
+namespace obj {
+
+DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi);
+
+struct RegLossParamOneAPI : public XGBoostParameter<RegLossParamOneAPI> {
+  float scale_pos_weight;  // multiplies the weight of samples whose label == 1.0f
+  // Declare scale_pos_weight: default 1.0f, lower bound 0.0f.
+  DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) {
+    DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
+      .describe("Scale the weight of positive examples by this factor");
+  }
+};
+
+template<typename Loss>  // Loss supplies the static hooks used below (gradients, CheckLabel, Name)
+class RegLossObjOneAPI : public ObjFunction {
+ protected:
+  HostDeviceVector<int> label_correct_;  // set in GetGradient(); not read inside this class
+
+ public:
+  RegLossObjOneAPI() = default;
+
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
+    param_.UpdateAllowUnknown(args);
+
+    cl::sycl::default_selector selector;
+    qu_ = cl::sycl::queue(selector);  // queue bound to the device chosen by the default selector
+  }
+
+  void GetGradient(const HostDeviceVector<bst_float>& preds,
+                   const MetaInfo &info,
+                   int iter,  // not used by this implementation
+                   HostDeviceVector<GradientPair>* out_gpair) override {
+    if (info.labels_.Size() == 0U) {
+      LOG(WARNING) << "Label set is empty.";
+    }
+    CHECK_EQ(preds.Size(), info.labels_.Size())
+        << " " << "labels are not correctly provided"
+        << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", "
+        << "Loss: " << Loss::Name();
+
+    size_t const ndata = preds.Size();
+    out_gpair->Resize(ndata);  // one gradient pair per prediction
+
+    // TODO: add label_correct check (the flag actually checked below is additional_input_buf)
+    label_correct_.Resize(1);
+    label_correct_.Fill(1);
+
+    bool is_null_weight = info.weights_.Size() == 0;  // no weights: every sample uses w = 1.0f
+
+    cl::sycl::buffer<bst_float, 1> preds_buf(preds.HostPointer(), preds.Size());
+    cl::sycl::buffer<bst_float, 1> labels_buf(info.labels_.HostPointer(), info.labels_.Size());
+    cl::sycl::buffer<GradientPair, 1> out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size());
+    cl::sycl::buffer<bst_float, 1> weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(),
+                                               is_null_weight ? 1 : info.weights_.Size());
+
+	cl::sycl::buffer<int, 1> additional_input_buf(1);  // single-int "labels are valid" flag
+	{
+		auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::write>();
+		additional_input_acc[0] = 1; // Start at 1 (= valid); the kernel writes 0 on a bad label
+	}
+
+    auto scale_pos_weight = param_.scale_pos_weight;  // local copy, captured by value in the kernel
+    if (!is_null_weight) {
+      CHECK_EQ(info.weights_.Size(), ndata)
+        << "Number of weights should be equal to number of data points.";
+    }
+
+    qu_.submit([&](cl::sycl::handler& cgh) {
+      auto preds_acc            = preds_buf.get_access<cl::sycl::access::mode::read>(cgh);
+      auto labels_acc           = labels_buf.get_access<cl::sycl::access::mode::read>(cgh);
+      auto weights_acc          = weights_buf.get_access<cl::sycl::access::mode::read>(cgh);
+      auto out_gpair_acc        = out_gpair_buf.get_access<cl::sycl::access::mode::write>(cgh);
+      auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::write>(cgh);
+      cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
+        int idx = pid[0];  // one work-item per sample
+        bst_float p = Loss::PredTransform(preds_acc[idx]);
+        bst_float w = is_null_weight ? 1.0f : weights_acc[idx];
+        bst_float label = labels_acc[idx];
+        if (label == 1.0f) {  // positive sample: apply scale_pos_weight
+          w *= scale_pos_weight;
+        }
+        if (!Loss::CheckLabel(label)) {
+          // Clear the flag; the host reads it after the kernel and reports Loss::LabelErrorMsg().
+          additional_input_acc[0] = 0;
+        }
+        out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
+                                          Loss::SecondOrderGradient(p, label) * w);
+      });
+    }).wait();  // block until the kernel has finished
+
+    int flag = 1;
+	{
+		auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::read>();
+		flag = additional_input_acc[0];  // read the validity flag back on the host
+	}
+
+    if (flag == 0) {  // at least one label failed Loss::CheckLabel
+      LOG(FATAL) << Loss::LabelErrorMsg();
+    }
+  
+  }
+
+ public:
+  const char* DefaultEvalMetric() const override {
+    return Loss::DefaultEvalMetric();
+  }
+
+  void PredTransform(HostDeviceVector<float> *io_preds) override {  // transforms in place
+    size_t const ndata = io_preds->Size();
+
+    cl::sycl::buffer<bst_float, 1> io_preds_buf(io_preds->HostPointer(), io_preds->Size());
+
+    qu_.submit([&](cl::sycl::handler& cgh) {
+      auto io_preds_acc = io_preds_buf.get_access<cl::sycl::access::mode::read_write>(cgh);
+      cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
+        int idx = pid[0];  // one work-item per prediction
+        io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]);
+      });
+    }).wait();  // block until the kernel has finished
+  }
+
+  float ProbToMargin(float base_score) const override {
+    return Loss::ProbToMargin(base_score);
+  }
+
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String(Loss::Name());
+    out["reg_loss_param"] = ToJson(param_);  // same key is read back by LoadConfig()
+  }
+
+  void LoadConfig(Json const& in) override {
+    FromJson(in["reg_loss_param"], &param_);  // only param_ is restored; "name" is not read here
+  }
+
+ protected:
+  RegLossParamOneAPI param_;
+
+  cl::sycl::queue qu_;  // assigned in Configure(); used to submit the kernels above
+};
+
+// Register the parameter struct, then one RegLossObjOneAPI objective per loss under Loss::Name().
+DMLC_REGISTER_PARAMETER(RegLossParamOneAPI);
+
+// TODO: Find a better way to dispatch the names of DPC++ kernels for the various loss-function template parameters
+XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name())
+.describe("Regression with squared error with DPC++ backend.")
+.set_body([]() { return new RegLossObjOneAPI<LinearSquareLossOneAPI>(); });
+XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name())
+.describe("Regression with root mean squared logarithmic error with DPC++ backend.")
+.set_body([]() { return new RegLossObjOneAPI<SquaredLogErrorOneAPI>(); });
+XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name())
+.describe("Logistic regression for probability regression task with DPC++ backend.")
+.set_body([]() { return new RegLossObjOneAPI<LogisticRegressionOneAPI>(); });
+XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name())
+.describe("Logistic regression for binary classification task with DPC++ backend.")
+.set_body([]() { return new RegLossObjOneAPI<LogisticClassificationOneAPI>(); });
+XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name())
+.describe("Logistic regression for classification, output score "
+          "before logistic transformation with DPC++ backend.")
+.set_body([]() { return new RegLossObjOneAPI<LogisticRawOneAPI>(); });
+
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h
index 9a9c162d2..df151ce9a 100644
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -1,391 +1,391 @@
-/*!
- * Copyright 2021-2022 by Contributors
- * \file row_set.h
- * \brief Quick Utility to compute subset of rows
- * \author Philip Cho, Tianqi Chen
- */
-#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
-#define XGBOOST_COMMON_PARTITION_BUILDER_H_
-
-#include <xgboost/data.h>
-
-#include <algorithm>
-#include <limits>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "../tree/hist/expand_entry.h"
-#include "categorical.h"
-#include "column_matrix.h"
-#include "xgboost/context.h"
-#include "xgboost/tree_model.h"
-
-namespace xgboost {
-namespace common {
-
-// The builder is required for samples partition to left and rights children for set of nodes
-// Responsible for:
-// 1) Effective memory allocation for intermediate results for multi-thread work
-// 2) Merging partial results produced by threads into original row set (row_set_collection_)
-// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
-template<size_t BlockSize>
-class PartitionBuilder {
-  using BitVector = RBitField8;
-
- public:
-  template<typename Func>
-  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
-    left_right_nodes_sizes_.resize(n_nodes);
-    blocks_offsets_.resize(n_nodes+1);
-
-    blocks_offsets_[0] = 0;
-    for (size_t i = 1; i < n_nodes+1; ++i) {
-      blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
-    }
-
-    if (n_tasks > max_n_tasks_) {
-      mem_blocks_.resize(n_tasks);
-      max_n_tasks_ = n_tasks;
-    }
-  }
-
-  // split row indexes (rid_span) to 2 parts (left_part, right_part) depending
-  // on comparison of indexes values (idx_span) and split point (split_cond)
-  // Handle dense columns
-  // Analog of std::stable_partition, but in no-inplace manner
-  template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
-  inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
-                                                   common::Span<const size_t> row_indices,
-                                                   common::Span<size_t> left_part,
-                                                   common::Span<size_t> right_part,
-                                                   size_t base_rowid, Predicate&& pred) {
-    auto& column = *p_column;
-    size_t* p_left_part = left_part.data();
-    size_t* p_right_part = right_part.data();
-    size_t nleft_elems = 0;
-    size_t nright_elems = 0;
-
-    auto p_row_indices = row_indices.data();
-    auto n_samples = row_indices.size();
-
-    for (size_t i = 0; i < n_samples; ++i) {
-      auto rid = p_row_indices[i];
-      const int32_t bin_id = column[rid - base_rowid];
-      if (any_missing && bin_id == ColumnType::kMissingId) {
-        if (default_left) {
-          p_left_part[nleft_elems++] = rid;
-        } else {
-          p_right_part[nright_elems++] = rid;
-        }
-      } else {
-        if (pred(rid, bin_id)) {
-          p_left_part[nleft_elems++] = rid;
-        } else {
-          p_right_part[nright_elems++] = rid;
-        }
-      }
-    }
-
-    return {nleft_elems, nright_elems};
-  }
-
-  template <typename Pred>
-  inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
-                                                        common::Span<size_t> left_part,
-                                                        common::Span<size_t> right_part,
-                                                        Pred pred) {
-    size_t* p_left_part = left_part.data();
-    size_t* p_right_part = right_part.data();
-    size_t nleft_elems = 0;
-    size_t nright_elems = 0;
-    for (auto row_id : ridx) {
-      if (pred(row_id)) {
-        p_left_part[nleft_elems++] = row_id;
-      } else {
-        p_right_part[nright_elems++] = row_id;
-      }
-    }
-    return {nleft_elems, nright_elems};
-  }
-
-  template <typename BinIdxType, bool any_missing, bool any_cat>
-  void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
-                 const common::Range1d range,
-                 const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
-                 const common::ColumnMatrix& column_matrix,
-                 const RegTree& tree, const size_t* rid) {
-    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
-    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
-    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
-    std::size_t nid = nodes[node_in_set].nid;
-    bst_feature_t fid = tree[nid].SplitIndex();
-    bool default_left = tree[nid].DefaultLeft();
-    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
-    auto node_cats = tree.NodeCats(nid);
-    auto const& cut_values = gmat.cut.Values();
-
-    auto pred_hist = [&](auto ridx, auto bin_id) {
-      if (any_cat && is_cat) {
-        auto gidx = gmat.GetGindex(ridx, fid);
-        bool go_left = default_left;
-        if (gidx > -1) {
-          go_left = Decision(node_cats, cut_values[gidx]);
-        }
-        return go_left;
-      } else {
-        return bin_id <= split_cond;
-      }
-    };
-
-    auto pred_approx = [&](auto ridx) {
-      auto gidx = gmat.GetGindex(ridx, fid);
-      bool go_left = default_left;
-      if (gidx > -1) {
-        if (is_cat) {
-          go_left = Decision(node_cats, cut_values[gidx]);
-        } else {
-          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
-        }
-      }
-      return go_left;
-    };
-
-    std::pair<size_t, size_t> child_nodes_sizes;
-    if (!column_matrix.IsInitialized()) {
-      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
-    } else {
-      if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
-        auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
-        if (default_left) {
-          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
-                                                                 gmat.base_rowid, pred_hist);
-        } else {
-          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
-                                                                  gmat.base_rowid, pred_hist);
-        }
-      } else {
-        CHECK_EQ(any_missing, true);
-        auto column =
-            column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
-        if (default_left) {
-          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
-                                                                 gmat.base_rowid, pred_hist);
-        } else {
-          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
-                                                                  gmat.base_rowid, pred_hist);
-        }
-      }
-    }
-
-    const size_t n_left  = child_nodes_sizes.first;
-    const size_t n_right = child_nodes_sizes.second;
-
-    SetNLeftElems(node_in_set, range.begin(), n_left);
-    SetNRightElems(node_in_set, range.begin(), n_right);
-  }
-
-  /**
-   * @brief When data is split by column, we don't have all the features locally on the current
-   * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
-   * to go right, or if the feature value used for the split is missing.
-   */
-  void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
-                const common::Range1d range, GHistIndexMatrix const& gmat,
-                const common::ColumnMatrix& column_matrix,
-                const RegTree& tree, const size_t* rid,
-                BitVector* decision_bits, BitVector* missing_bits) {
-    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
-    std::size_t nid = nodes[node_in_set].nid;
-    bst_feature_t fid = tree[nid].SplitIndex();
-    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
-    auto node_cats = tree.NodeCats(nid);
-    auto const& cut_values = gmat.cut.Values();
-
-    if (!column_matrix.IsInitialized()) {
-      for (auto row_id : rid_span) {
-        auto gidx = gmat.GetGindex(row_id, fid);
-        if (gidx > -1) {
-          bool go_left = false;
-          if (is_cat) {
-            go_left = Decision(node_cats, cut_values[gidx]);
-          } else {
-            go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
-          }
-          if (go_left) {
-            decision_bits->Set(row_id - gmat.base_rowid);
-          }
-        } else {
-          missing_bits->Set(row_id - gmat.base_rowid);
-        }
-      }
-    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
-    }
-  }
-
-  /**
-   * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
-   * use them to partition the rows accordingly.
-   */
-  void PartitionByMask(const size_t node_in_set,
-                       std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
-                       const common::Range1d range, GHistIndexMatrix const& gmat,
-                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
-                       const size_t* rid, BitVector const& decision_bits,
-                       BitVector const& missing_bits) {
-    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
-    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
-    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
-    std::size_t nid = nodes[node_in_set].nid;
-    bool default_left = tree[nid].DefaultLeft();
-
-    auto pred_approx = [&](auto ridx) {
-      bool go_left = default_left;
-      bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
-      if (!is_missing) {
-        go_left = decision_bits.Check(ridx - gmat.base_rowid);
-      }
-      return go_left;
-    };
-
-    std::pair<size_t, size_t> child_nodes_sizes;
-    if (!column_matrix.IsInitialized()) {
-      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
-    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
-    }
-
-    const size_t n_left  = child_nodes_sizes.first;
-    const size_t n_right = child_nodes_sizes.second;
-
-    SetNLeftElems(node_in_set, range.begin(), n_left);
-    SetNRightElems(node_in_set, range.begin(), n_right);
-  }
-
-  // allocate thread local memory, should be called for each specific task
-  void AllocateForTask(size_t id) {
-    if (mem_blocks_[id].get() == nullptr) {
-      BlockInfo* local_block_ptr = new BlockInfo;
-      CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
-      mem_blocks_[id].reset(local_block_ptr);
-    }
-  }
-
-  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
-    const size_t task_idx = GetTaskIdx(nid, begin);
-    return { mem_blocks_.at(task_idx)->Left(), end - begin };
-  }
-
-  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
-    const size_t task_idx = GetTaskIdx(nid, begin);
-    return { mem_blocks_.at(task_idx)->Right(), end - begin };
-  }
-
-  void SetNLeftElems(int nid, size_t begin, size_t n_left) {
-    size_t task_idx = GetTaskIdx(nid, begin);
-    mem_blocks_.at(task_idx)->n_left = n_left;
-  }
-
-  void SetNRightElems(int nid, size_t begin, size_t n_right) {
-    size_t task_idx = GetTaskIdx(nid, begin);
-    mem_blocks_.at(task_idx)->n_right = n_right;
-  }
-
-
-  size_t GetNLeftElems(int nid) const {
-    return left_right_nodes_sizes_[nid].first;
-  }
-
-  size_t GetNRightElems(int nid) const {
-    return left_right_nodes_sizes_[nid].second;
-  }
-
-  // Each thread has partial results for some set of tree-nodes
-  // The function decides order of merging partial results into final row set
-  void CalculateRowOffsets() {
-    for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
-      size_t n_left = 0;
-      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
-        mem_blocks_[j]->n_offset_left = n_left;
-        n_left += mem_blocks_[j]->n_left;
-      }
-      size_t n_right = 0;
-      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
-        mem_blocks_[j]->n_offset_right = n_left + n_right;
-        n_right += mem_blocks_[j]->n_right;
-      }
-      left_right_nodes_sizes_[i] = {n_left, n_right};
-    }
-  }
-
-  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
-    size_t task_idx = GetTaskIdx(nid, begin);
-
-    size_t* left_result  = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
-    size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
-
-    const size_t* left = mem_blocks_[task_idx]->Left();
-    const size_t* right = mem_blocks_[task_idx]->Right();
-
-    std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
-    std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
-  }
-
-  size_t GetTaskIdx(int nid, size_t begin) {
-    return blocks_offsets_[nid] + begin / BlockSize;
-  }
-
-  // Copy row partitions into global cache for reuse in objective
-  template <typename Sampledp>
-  void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
-                     std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
-    auto& h_pos = *p_position;
-    h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
-
-    auto p_begin = row_set.Data()->data();
-    ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
-      auto const& node = row_set[i];
-      if (node.node_id < 0) {
-        return;
-      }
-      CHECK(tree[node.node_id].IsLeaf());
-      if (node.begin) {  // guard for empty node.
-        size_t ptr_offset = node.end - p_begin;
-        CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
-        for (auto idx = node.begin; idx != node.end; ++idx) {
-          h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
-        }
-      }
-    });
-  }
-
- protected:
-  struct BlockInfo{
-    size_t n_left;
-    size_t n_right;
-
-    size_t n_offset_left;
-    size_t n_offset_right;
-
-    size_t* Left() {
-      return &left_data_[0];
-    }
-
-    size_t* Right() {
-      return &right_data_[0];
-    }
-   private:
-    size_t left_data_[BlockSize];
-    size_t right_data_[BlockSize];
-  };
-  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
-  std::vector<size_t> blocks_offsets_;
-  std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
-  size_t max_n_tasks_ = 0;
-};
-
-}  // namespace common
-}  // namespace xgboost
-
-#endif  // XGBOOST_COMMON_PARTITION_BUILDER_H_
+/*!
+ * Copyright 2021-2022 by Contributors
+ * \file partition_builder.h
+ * \brief Quick Utility to compute subset of rows
+ * \author Philip Cho, Tianqi Chen
+ */
+#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
+#define XGBOOST_COMMON_PARTITION_BUILDER_H_
+
+#include <xgboost/data.h>
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "../tree/hist/expand_entry.h"
+#include "categorical.h"
+#include "column_matrix.h"
+#include "xgboost/context.h"
+#include "xgboost/tree_model.h"
+
+namespace xgboost {
+namespace common {
+
+// The builder partitions samples into the left and right children for a set of nodes.
+// Responsible for:
+// 1) Efficient memory allocation for intermediate results in multi-threaded work
+// 2) Merging partial results produced by threads into the original row set (row_set_collection_)
+// BlockSize is a template parameter to easily enable memory alignment with C++11 'alignas()'
+template<size_t BlockSize>
+class PartitionBuilder {
+  using BitVector = RBitField8;
+
+ public:
+  template<typename Func>  // funcNTask(i): presumably the number of tasks of node i (confirm)
+  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
+    left_right_nodes_sizes_.resize(n_nodes);
+    blocks_offsets_.resize(n_nodes+1);  // one extra slot holds the running total
+
+    blocks_offsets_[0] = 0;  // running sum: blocks_offsets_[i] = sum of funcNTask(0..i-1)
+    for (size_t i = 1; i < n_nodes+1; ++i) {
+      blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTask(i-1);
+    }
+
+    if (n_tasks > max_n_tasks_) {  // grow only; mem_blocks_ is never shrunk here
+      mem_blocks_.resize(n_tasks);
+      max_n_tasks_ = n_tasks;
+    }
+  }
+
+  // Split the row indices in row_indices into left_part / right_part. A row whose bin equals
+  // ColumnType::kMissingId (checked only when any_missing) follows default_left; any other row
+  // goes left iff pred(rid, bin_id) is true. Returns {number of left rows, number of right rows}.
+  // Analogous to std::stable_partition, but writes to separate output buffers (not in place).
+  template <bool default_left, bool any_missing, typename ColumnType, typename Predicate>
+  inline std::pair<size_t, size_t> PartitionKernel(ColumnType* p_column,
+                                                   common::Span<const size_t> row_indices,
+                                                   common::Span<size_t> left_part,
+                                                   common::Span<size_t> right_part,
+                                                   size_t base_rowid, Predicate&& pred) {
+    auto& column = *p_column;
+    size_t* p_left_part = left_part.data();  // raw pointers: the loop does no Span bounds checks
+    size_t* p_right_part = right_part.data();
+    size_t nleft_elems = 0;
+    size_t nright_elems = 0;
+
+    auto p_row_indices = row_indices.data();
+    auto n_samples = row_indices.size();
+
+    for (size_t i = 0; i < n_samples; ++i) {
+      auto rid = p_row_indices[i];
+      const int32_t bin_id = column[rid - base_rowid];  // column is indexed relative to base_rowid
+      if (any_missing && bin_id == ColumnType::kMissingId) {
+        if (default_left) {  // missing value: follow the default direction
+          p_left_part[nleft_elems++] = rid;
+        } else {
+          p_right_part[nright_elems++] = rid;
+        }
+      } else {
+        if (pred(rid, bin_id)) {  // present value: the predicate decides
+          p_left_part[nleft_elems++] = rid;
+        } else {
+          p_right_part[nright_elems++] = rid;
+        }
+      }
+    }
+
+    return {nleft_elems, nright_elems};
+  }
+
+  template <typename Pred>
+  inline std::pair<size_t, size_t> PartitionRangeKernel(common::Span<const size_t> ridx,
+                                                        common::Span<size_t> left_part,
+                                                        common::Span<size_t> right_part,
+                                                        Pred pred) {
+    size_t* p_left_part = left_part.data();
+    size_t* p_right_part = right_part.data();
+    size_t nleft_elems = 0;
+    size_t nright_elems = 0;
+    for (auto row_id : ridx) {
+      if (pred(row_id)) {
+        p_left_part[nleft_elems++] = row_id;
+      } else {
+        p_right_part[nright_elems++] = row_id;
+      }
+    }
+    return {nleft_elems, nright_elems};
+  }
+
+  template <typename BinIdxType, bool any_missing, bool any_cat>
+  void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
+                 const common::Range1d range,
+                 const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
+                 const common::ColumnMatrix& column_matrix,
+                 const RegTree& tree, const size_t* rid) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
+    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
+    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bst_feature_t fid = tree[nid].SplitIndex();
+    bool default_left = tree[nid].DefaultLeft();
+    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
+    auto node_cats = tree.NodeCats(nid);
+    auto const& cut_values = gmat.cut.Values();
+
+    auto pred_hist = [&](auto ridx, auto bin_id) {
+      if (any_cat && is_cat) {
+        auto gidx = gmat.GetGindex(ridx, fid);
+        bool go_left = default_left;
+        if (gidx > -1) {
+          go_left = Decision(node_cats, cut_values[gidx]);
+        }
+        return go_left;
+      } else {
+        return bin_id <= split_cond;
+      }
+    };
+
+    auto pred_approx = [&](auto ridx) {
+      auto gidx = gmat.GetGindex(ridx, fid);
+      bool go_left = default_left;
+      if (gidx > -1) {
+        if (is_cat) {
+          go_left = Decision(node_cats, cut_values[gidx]);
+        } else {
+          go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
+        }
+      }
+      return go_left;
+    };
+
+    std::pair<size_t, size_t> child_nodes_sizes;
+    if (!column_matrix.IsInitialized()) {
+      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
+    } else {
+      if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
+        auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
+        if (default_left) {
+          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
+                                                                 gmat.base_rowid, pred_hist);
+        } else {
+          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
+                                                                  gmat.base_rowid, pred_hist);
+        }
+      } else {
+        CHECK_EQ(any_missing, true);
+        auto column =
+            column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
+        if (default_left) {
+          child_nodes_sizes = PartitionKernel<true, any_missing>(&column, rid_span, left, right,
+                                                                 gmat.base_rowid, pred_hist);
+        } else {
+          child_nodes_sizes = PartitionKernel<false, any_missing>(&column, rid_span, left, right,
+                                                                  gmat.base_rowid, pred_hist);
+        }
+      }
+    }
+
+    const size_t n_left  = child_nodes_sizes.first;
+    const size_t n_right = child_nodes_sizes.second;
+
+    SetNLeftElems(node_in_set, range.begin(), n_left);
+    SetNRightElems(node_in_set, range.begin(), n_right);
+  }
+
+  /**
+   * @brief When data is split by column, we don't have all the features locally on the current
+   * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
+   * to go right, or if the feature value used for the split is missing.
+   */
+  void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
+                const common::Range1d range, GHistIndexMatrix const& gmat,
+                const common::ColumnMatrix& column_matrix,
+                const RegTree& tree, const size_t* rid,
+                BitVector* decision_bits, BitVector* missing_bits) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bst_feature_t fid = tree[nid].SplitIndex();
+    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
+    auto node_cats = tree.NodeCats(nid);
+    auto const& cut_values = gmat.cut.Values();
+
+    if (!column_matrix.IsInitialized()) {
+      for (auto row_id : rid_span) {
+        auto gidx = gmat.GetGindex(row_id, fid);
+        if (gidx > -1) {
+          bool go_left = false;
+          if (is_cat) {
+            go_left = Decision(node_cats, cut_values[gidx]);
+          } else {
+            go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
+          }
+          if (go_left) {
+            decision_bits->Set(row_id - gmat.base_rowid);
+          }
+        } else {
+          missing_bits->Set(row_id - gmat.base_rowid);
+        }
+      }
+    } else {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    }
+  }
+
+  /**
+   * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
+   * use them to partition the rows accordingly.
+   */
+  void PartitionByMask(const size_t node_in_set,
+                       std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
+                       const common::Range1d range, GHistIndexMatrix const& gmat,
+                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
+                       const size_t* rid, BitVector const& decision_bits,
+                       BitVector const& missing_bits) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
+    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
+    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bool default_left = tree[nid].DefaultLeft();
+
+    auto pred_approx = [&](auto ridx) {
+      bool go_left = default_left;
+      bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
+      if (!is_missing) {
+        go_left = decision_bits.Check(ridx - gmat.base_rowid);
+      }
+      return go_left;
+    };
+
+    std::pair<size_t, size_t> child_nodes_sizes;
+    if (!column_matrix.IsInitialized()) {
+      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
+    } else {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    }
+
+    const size_t n_left  = child_nodes_sizes.first;
+    const size_t n_right = child_nodes_sizes.second;
+
+    SetNLeftElems(node_in_set, range.begin(), n_left);
+    SetNRightElems(node_in_set, range.begin(), n_right);
+  }
+
+  // allocate thread local memory, should be called for each specific task
+  void AllocateForTask(size_t id) {
+    if (mem_blocks_[id].get() == nullptr) {
+      BlockInfo* local_block_ptr = new BlockInfo;
+      CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
+      mem_blocks_[id].reset(local_block_ptr);
+    }
+  }
+
+  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
+    const size_t task_idx = GetTaskIdx(nid, begin);
+    return { mem_blocks_.at(task_idx)->Left(), end - begin };
+  }
+
+  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
+    const size_t task_idx = GetTaskIdx(nid, begin);
+    return { mem_blocks_.at(task_idx)->Right(), end - begin };
+  }
+
+  void SetNLeftElems(int nid, size_t begin, size_t n_left) {
+    size_t task_idx = GetTaskIdx(nid, begin);
+    mem_blocks_.at(task_idx)->n_left = n_left;
+  }
+
+  void SetNRightElems(int nid, size_t begin, size_t n_right) {
+    size_t task_idx = GetTaskIdx(nid, begin);
+    mem_blocks_.at(task_idx)->n_right = n_right;
+  }
+
+
+  size_t GetNLeftElems(int nid) const {
+    return left_right_nodes_sizes_[nid].first;
+  }
+
+  size_t GetNRightElems(int nid) const {
+    return left_right_nodes_sizes_[nid].second;
+  }
+
+  // Each thread has partial results for some set of tree-nodes
+  // The function decides order of merging partial results into final row set
+  void CalculateRowOffsets() {
+    for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
+      size_t n_left = 0;
+      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
+        mem_blocks_[j]->n_offset_left = n_left;
+        n_left += mem_blocks_[j]->n_left;
+      }
+      size_t n_right = 0;
+      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) {
+        mem_blocks_[j]->n_offset_right = n_left + n_right;
+        n_right += mem_blocks_[j]->n_right;
+      }
+      left_right_nodes_sizes_[i] = {n_left, n_right};
+    }
+  }
+
+  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
+    size_t task_idx = GetTaskIdx(nid, begin);
+
+    size_t* left_result  = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
+    size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
+
+    const size_t* left = mem_blocks_[task_idx]->Left();
+    const size_t* right = mem_blocks_[task_idx]->Right();
+
+    std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
+    std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
+  }
+
+  size_t GetTaskIdx(int nid, size_t begin) {
+    return blocks_offsets_[nid] + begin / BlockSize;
+  }
+
+  // Copy row partitions into global cache for reuse in objective
+  template <typename Sampledp>
+  void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set,
+                     std::vector<bst_node_t>* p_position, Sampledp sampledp) const {
+    auto& h_pos = *p_position;
+    h_pos.resize(row_set.Data()->size(), std::numeric_limits<bst_node_t>::max());
+
+    auto p_begin = row_set.Data()->data();
+    ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) {
+      auto const& node = row_set[i];
+      if (node.node_id < 0) {
+        return;
+      }
+      CHECK(tree[node.node_id].IsLeaf());
+      if (node.begin) {  // guard for empty node.
+        size_t ptr_offset = node.end - p_begin;
+        CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
+        for (auto idx = node.begin; idx != node.end; ++idx) {
+          h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id;
+        }
+      }
+    });
+  }
+
+ protected:
+  struct BlockInfo{
+    size_t n_left;
+    size_t n_right;
+
+    size_t n_offset_left;
+    size_t n_offset_right;
+
+    size_t* Left() {
+      return &left_data_[0];
+    }
+
+    size_t* Right() {
+      return &right_data_[0];
+    }
+   private:
+    size_t left_data_[BlockSize];
+    size_t right_data_[BlockSize];
+  };
+  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
+  std::vector<size_t> blocks_offsets_;
+  std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
+  size_t max_n_tasks_ = 0;
+};
+
+}  // namespace common
+}  // namespace xgboost
+
+#endif  // XGBOOST_COMMON_PARTITION_BUILDER_H_
diff --git a/src/tree/driver.h b/src/tree/driver.h
index a4a0dd4a6..c3189a70c 100644
--- a/src/tree/driver.h
+++ b/src/tree/driver.h
@@ -1,111 +1,111 @@
-/*!
- * Copyright 2021 by XGBoost Contributors
- */
-#ifndef XGBOOST_TREE_DRIVER_H_
-#define XGBOOST_TREE_DRIVER_H_
-#include <xgboost/span.h>
-#include <queue>
-#include <vector>
-#include "./param.h"
-
-namespace xgboost {
-namespace tree {
-
-template <typename ExpandEntryT>
-inline bool DepthWise(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
-  return lhs.GetNodeId() > rhs.GetNodeId();  // favor small depth
-}
-
-template <typename ExpandEntryT>
-inline bool LossGuide(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
-  if (lhs.GetLossChange() == rhs.GetLossChange()) {
-    return lhs.GetNodeId() > rhs.GetNodeId();  // favor small timestamp
-  } else {
-    return lhs.GetLossChange() < rhs.GetLossChange();  // favor large loss_chg
-  }
-}
-
-// Drives execution of tree building on device
-template <typename ExpandEntryT>
-class Driver {
-  using ExpandQueue =
-      std::priority_queue<ExpandEntryT, std::vector<ExpandEntryT>,
-                          std::function<bool(ExpandEntryT, ExpandEntryT)>>;
-
- public:
-  explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256)
-      : param_(param),
-        max_node_batch_size_(max_node_batch_size),
-        queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise<ExpandEntryT>
-                                                           : LossGuide<ExpandEntryT>) {}
-  template <typename EntryIterT>
-  void Push(EntryIterT begin, EntryIterT end) {
-    for (auto it = begin; it != end; ++it) {
-      const ExpandEntryT& e = *it;
-      if (e.split.loss_chg > kRtEps) {
-        queue_.push(e);
-      }
-    }
-  }
-  void Push(const std::vector<ExpandEntryT> &entries) {
-    this->Push(entries.begin(), entries.end());
-  }
-  void Push(ExpandEntryT const& e) { queue_.push(e); }
-
-  bool IsEmpty() {
-    return queue_.empty();
-  }
-
-  // Can a child of this entry still be expanded?
-  // can be used to avoid extra work
-  bool IsChildValid(ExpandEntryT const& parent_entry) {
-    if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false;
-    if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false;
-    return true;
-  }
-
-  // Return the set of nodes to be expanded
-  // This set has no dependencies between entries so they may be expanded in
-  // parallel or asynchronously
-  std::vector<ExpandEntryT> Pop() {
-    if (queue_.empty()) return {};
-    // Return a single entry for loss guided mode
-    if (param_.grow_policy == TrainParam::kLossGuide) {
-      ExpandEntryT e = queue_.top();
-      queue_.pop();
-
-      if (e.IsValid(param_, num_leaves_)) {
-        num_leaves_++;
-        return {e};
-      } else {
-        return {};
-      }
-    }
-    // Return nodes on same level for depth wise
-    std::vector<ExpandEntryT> result;
-    ExpandEntryT e = queue_.top();
-    int level = e.depth;
-    while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) {
-      queue_.pop();
-      if (e.IsValid(param_, num_leaves_)) {
-        num_leaves_++;
-        result.emplace_back(e);
-      }
-
-      if (!queue_.empty()) {
-        e = queue_.top();
-      }
-    }
-    return result;
-  }
-
- private:
-  TrainParam param_;
-  bst_node_t num_leaves_ = 1;
-  std::size_t max_node_batch_size_;
-  ExpandQueue queue_;
-};
-}  // namespace tree
-}  // namespace xgboost
-
-#endif  // XGBOOST_TREE_DRIVER_H_
+/*!
+ * Copyright 2021 by XGBoost Contributors
+ */
+#ifndef XGBOOST_TREE_DRIVER_H_
+#define XGBOOST_TREE_DRIVER_H_
+#include <xgboost/span.h>
+#include <queue>
+#include <vector>
+#include "./param.h"
+
+namespace xgboost {
+namespace tree {
+
+template <typename ExpandEntryT>
+inline bool DepthWise(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
+  return lhs.GetNodeId() > rhs.GetNodeId();  // favor small depth
+}
+
+template <typename ExpandEntryT>
+inline bool LossGuide(const ExpandEntryT& lhs, const ExpandEntryT& rhs) {
+  if (lhs.GetLossChange() == rhs.GetLossChange()) {
+    return lhs.GetNodeId() > rhs.GetNodeId();  // favor small timestamp
+  } else {
+    return lhs.GetLossChange() < rhs.GetLossChange();  // favor large loss_chg
+  }
+}
+
+// Drives execution of tree building on device
+template <typename ExpandEntryT>
+class Driver {
+  using ExpandQueue =
+      std::priority_queue<ExpandEntryT, std::vector<ExpandEntryT>,
+                          std::function<bool(ExpandEntryT, ExpandEntryT)>>;
+
+ public:
+  explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256)
+      : param_(param),
+        max_node_batch_size_(max_node_batch_size),
+        queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise<ExpandEntryT>
+                                                           : LossGuide<ExpandEntryT>) {}
+  template <typename EntryIterT>
+  void Push(EntryIterT begin, EntryIterT end) {
+    for (auto it = begin; it != end; ++it) {
+      const ExpandEntryT& e = *it;
+      if (e.split.loss_chg > kRtEps) {
+        queue_.push(e);
+      }
+    }
+  }
+  void Push(const std::vector<ExpandEntryT> &entries) {
+    this->Push(entries.begin(), entries.end());
+  }
+  void Push(ExpandEntryT const& e) { queue_.push(e); }
+
+  bool IsEmpty() {
+    return queue_.empty();
+  }
+
+  // Can a child of this entry still be expanded?
+  // can be used to avoid extra work
+  bool IsChildValid(ExpandEntryT const& parent_entry) {
+    if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false;
+    if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false;
+    return true;
+  }
+
+  // Return the set of nodes to be expanded
+  // This set has no dependencies between entries so they may be expanded in
+  // parallel or asynchronously
+  std::vector<ExpandEntryT> Pop() {
+    if (queue_.empty()) return {};
+    // Return a single entry for loss guided mode
+    if (param_.grow_policy == TrainParam::kLossGuide) {
+      ExpandEntryT e = queue_.top();
+      queue_.pop();
+
+      if (e.IsValid(param_, num_leaves_)) {
+        num_leaves_++;
+        return {e};
+      } else {
+        return {};
+      }
+    }
+    // Return nodes on same level for depth wise
+    std::vector<ExpandEntryT> result;
+    ExpandEntryT e = queue_.top();
+    int level = e.depth;
+    while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) {
+      queue_.pop();
+      if (e.IsValid(param_, num_leaves_)) {
+        num_leaves_++;
+        result.emplace_back(e);
+      }
+
+      if (!queue_.empty()) {
+        e = queue_.top();
+      }
+    }
+    return result;
+  }
+
+ private:
+  TrainParam param_;
+  bst_node_t num_leaves_ = 1;
+  std::size_t max_node_batch_size_;
+  ExpandQueue queue_;
+};
+}  // namespace tree
+}  // namespace xgboost
+
+#endif  // XGBOOST_TREE_DRIVER_H_
diff --git a/tests/cpp/common/test_partition_builder.cc b/tests/cpp/common/test_partition_builder.cc
index 093f87708..4e6d800a7 100644
--- a/tests/cpp/common/test_partition_builder.cc
+++ b/tests/cpp/common/test_partition_builder.cc
@@ -1,79 +1,79 @@
-#include <gtest/gtest.h>
-#include <vector>
-#include <string>
-#include <utility>
-
-#include "../../../src/common/row_set.h"
-#include "../../../src/common/partition_builder.h"
-#include "../helpers.h"
-
-namespace xgboost {
-namespace common {
-
-TEST(PartitionBuilder, BasicTest) {
-  constexpr size_t kBlockSize = 16;
-  constexpr size_t kNodes = 5;
-  constexpr size_t kTasks = 3 + 5 + 10 + 1 + 2;
-
-  std::vector<size_t> tasks = { 3, 5, 10, 1, 2 };
-
-  PartitionBuilder<kBlockSize> builder;
-  builder.Init(kTasks, kNodes, [&](size_t i) {
-    return tasks[i];
-  });
-
-  std::vector<size_t> rows_for_left_node = { 2, 12, 0, 16, 8 };
-
-  for(size_t nid = 0; nid < kNodes; ++nid) {
-    size_t value_left = 0;
-    size_t value_right = 0;
-
-    size_t left_total = tasks[nid] * rows_for_left_node[nid];
-
-    for(size_t j = 0; j < tasks[nid]; ++j) {
-      size_t begin = kBlockSize*j;
-      size_t end = kBlockSize*(j+1);
-      const size_t id = builder.GetTaskIdx(nid, begin);
-      builder.AllocateForTask(id);
-
-      auto left  = builder.GetLeftBuffer(nid, begin, end);
-      auto right = builder.GetRightBuffer(nid, begin, end);
-
-      size_t n_left   = rows_for_left_node[nid];
-      size_t n_right = kBlockSize - rows_for_left_node[nid];
-
-      for(size_t i = 0; i < n_left; i++) {
-        left[i] = value_left++;
-      }
-
-      for(size_t i = 0; i < n_right; i++) {
-        right[i] = left_total + value_right++;
-      }
-
-      builder.SetNLeftElems(nid, begin, n_left);
-      builder.SetNRightElems(nid, begin, n_right);
-    }
-  }
-  builder.CalculateRowOffsets();
-
-  std::vector<size_t> v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize);
-
-  for(size_t nid = 0; nid < kNodes; ++nid) {
-
-    for(size_t j = 0; j < tasks[nid]; ++j) {
-      builder.MergeToArray(nid, kBlockSize*j, v.data());
-    }
-
-    for(size_t j = 0; j < tasks[nid] * kBlockSize; ++j) {
-      ASSERT_EQ(v[j], j);
-    }
-    size_t n_left  = builder.GetNLeftElems(nid);
-    size_t n_right = builder.GetNRightElems(nid);
-
-    ASSERT_EQ(n_left, rows_for_left_node[nid] * tasks[nid]);
-    ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]);
-  }
-}
-
-}  // namespace common
-}  // namespace xgboost
+#include <gtest/gtest.h>
+#include <vector>
+#include <string>
+#include <utility>
+
+#include "../../../src/common/row_set.h"
+#include "../../../src/common/partition_builder.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace common {
+
+TEST(PartitionBuilder, BasicTest) {
+  constexpr size_t kBlockSize = 16;
+  constexpr size_t kNodes = 5;
+  constexpr size_t kTasks = 3 + 5 + 10 + 1 + 2;
+
+  std::vector<size_t> tasks = { 3, 5, 10, 1, 2 };
+
+  PartitionBuilder<kBlockSize> builder;
+  builder.Init(kTasks, kNodes, [&](size_t i) {
+    return tasks[i];
+  });
+
+  std::vector<size_t> rows_for_left_node = { 2, 12, 0, 16, 8 };
+
+  for(size_t nid = 0; nid < kNodes; ++nid) {
+    size_t value_left = 0;
+    size_t value_right = 0;
+
+    size_t left_total = tasks[nid] * rows_for_left_node[nid];
+
+    for(size_t j = 0; j < tasks[nid]; ++j) {
+      size_t begin = kBlockSize*j;
+      size_t end = kBlockSize*(j+1);
+      const size_t id = builder.GetTaskIdx(nid, begin);
+      builder.AllocateForTask(id);
+
+      auto left  = builder.GetLeftBuffer(nid, begin, end);
+      auto right = builder.GetRightBuffer(nid, begin, end);
+
+      size_t n_left   = rows_for_left_node[nid];
+      size_t n_right = kBlockSize - rows_for_left_node[nid];
+
+      for(size_t i = 0; i < n_left; i++) {
+        left[i] = value_left++;
+      }
+
+      for(size_t i = 0; i < n_right; i++) {
+        right[i] = left_total + value_right++;
+      }
+
+      builder.SetNLeftElems(nid, begin, n_left);
+      builder.SetNRightElems(nid, begin, n_right);
+    }
+  }
+  builder.CalculateRowOffsets();
+
+  std::vector<size_t> v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize);
+
+  for(size_t nid = 0; nid < kNodes; ++nid) {
+
+    for(size_t j = 0; j < tasks[nid]; ++j) {
+      builder.MergeToArray(nid, kBlockSize*j, v.data());
+    }
+
+    for(size_t j = 0; j < tasks[nid] * kBlockSize; ++j) {
+      ASSERT_EQ(v[j], j);
+    }
+    size_t n_left  = builder.GetNLeftElems(nid);
+    size_t n_right = builder.GetNRightElems(nid);
+
+    ASSERT_EQ(n_left, rows_for_left_node[nid] * tasks[nid]);
+    ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]);
+  }
+}
+
+}  // namespace common
+}  // namespace xgboost

From a093770f36ba56250aa563eff3cfde2abbd3c0ec Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 16 Mar 2023 18:49:34 +0800
Subject: [PATCH 17/32] Partitioner for multi-target tree. (#8922)

---
 src/common/partition_builder.h             |  43 ++++-----
 src/learner.cc                             |   4 +-
 src/tree/common_row_partitioner.h          | 102 ++++++++++++---------
 tests/cpp/common/test_partition_builder.cc |  16 ++--
 tests/cpp/tree/test_approx.cc              |  73 ---------------
 tests/cpp/tree/test_common_partitioner.cc  |  93 +++++++++++++++++++
 tests/cpp/tree/test_partitioner.h          |  37 ++++++--
 tests/cpp/tree/test_quantile_hist.cc       |  49 ++++++----
 8 files changed, 239 insertions(+), 178 deletions(-)
 create mode 100644 tests/cpp/tree/test_common_partitioner.cc

diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h
index df151ce9a..e5e6971e5 100644
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2021-2022 by Contributors
+/**
+ * Copyright 2021-2023 by Contributors
  * \file row_set.h
  * \brief Quick Utility to compute subset of rows
  * \author Philip Cho, Tianqi Chen
@@ -10,6 +10,7 @@
 #include <xgboost/data.h>
 
 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <limits>
 #include <memory>
 #include <utility>
@@ -21,9 +22,7 @@
 #include "xgboost/context.h"
 #include "xgboost/tree_model.h"
 
-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 // The builder is required for samples partition to left and rights children for set of nodes
 // Responsible for:
 // 1) Effective memory allocation for intermediate results for multi-thread work
@@ -109,18 +108,17 @@ class PartitionBuilder {
     return {nleft_elems, nright_elems};
   }
 
-  template <typename BinIdxType, bool any_missing, bool any_cat>
-  void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
-                 const common::Range1d range,
-                 const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
-                 const common::ColumnMatrix& column_matrix,
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
+  void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
+                 const common::Range1d range, const bst_bin_t split_cond,
+                 GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
                  const RegTree& tree, const size_t* rid) {
     common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
     common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
     common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
     std::size_t nid = nodes[node_in_set].nid;
-    bst_feature_t fid = tree[nid].SplitIndex();
-    bool default_left = tree[nid].DefaultLeft();
+    bst_feature_t fid = tree.SplitIndex(nid);
+    bool default_left = tree.DefaultLeft(nid);
     bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
     auto node_cats = tree.NodeCats(nid);
     auto const& cut_values = gmat.cut.Values();
@@ -190,10 +188,10 @@ class PartitionBuilder {
    * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
    * to go right, or if the feature value used for the split is missing.
    */
-  void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
+  template <typename ExpandEntry>
+  void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
                 const common::Range1d range, GHistIndexMatrix const& gmat,
-                const common::ColumnMatrix& column_matrix,
-                const RegTree& tree, const size_t* rid,
+                const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
                 BitVector* decision_bits, BitVector* missing_bits) {
     common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
     std::size_t nid = nodes[node_in_set].nid;
@@ -228,8 +226,8 @@ class PartitionBuilder {
    * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
    * use them to partition the rows accordingly.
    */
-  void PartitionByMask(const size_t node_in_set,
-                       std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
+  template <typename ExpandEntry>
+  void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
                        const common::Range1d range, GHistIndexMatrix const& gmat,
                        const common::ColumnMatrix& column_matrix, const RegTree& tree,
                        const size_t* rid, BitVector const& decision_bits,
@@ -293,11 +291,11 @@ class PartitionBuilder {
   }
 
 
-  size_t GetNLeftElems(int nid) const {
+  [[nodiscard]] std::size_t GetNLeftElems(int nid) const {
     return left_right_nodes_sizes_[nid].first;
   }
 
-  size_t GetNRightElems(int nid) const {
+  [[nodiscard]] std::size_t GetNRightElems(int nid) const {
     return left_right_nodes_sizes_[nid].second;
   }
 
@@ -349,7 +347,7 @@ class PartitionBuilder {
       if (node.node_id < 0) {
         return;
       }
-      CHECK(tree[node.node_id].IsLeaf());
+      CHECK(tree.IsLeaf(node.node_id));
       if (node.begin) {  // guard for empty node.
         size_t ptr_offset = node.end - p_begin;
         CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
@@ -384,8 +382,5 @@ class PartitionBuilder {
   std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
   size_t max_n_tasks_ = 0;
 };
-
-}  // namespace common
-}  // namespace xgboost
-
+}  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_PARTITION_BUILDER_H_
diff --git a/src/learner.cc b/src/learner.cc
index d91add70d..e1b5605ca 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -343,8 +343,8 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
         .add_enum("monolithic", MultiStrategy::kMonolithic)
         .set_default(MultiStrategy::kComposite)
         .describe(
-            "Strategy used for training multi-target models. `mono` means building one single tree "
-            "for all targets.");
+            "Strategy used for training multi-target models. `monolithic` means building one "
+            "single tree for all targets.");
   }
 };
 
diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h
index a58dbb452..ba69d8921 100644
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -1,22 +1,26 @@
-/*!
- * Copyright 2021-2022 XGBoost contributors
+/**
+ * Copyright 2021-2023 XGBoost contributors
  * \file common_row_partitioner.h
  * \brief Common partitioner logic for hist and approx methods.
  */
 #ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
 #define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
 
+#include <algorithm>  // std::all_of
+#include <cinttypes>  // std::uint32_t
 #include <limits>  // std::numeric_limits
 #include <vector>
 
 #include "../collective/communicator-inl.h"
+#include "../common/linalg_op.h"  // cbegin
 #include "../common/numeric.h"  // Iota
 #include "../common/partition_builder.h"
 #include "hist/expand_entry.h"  // CPUExpandEntry
+#include "xgboost/base.h"
 #include "xgboost/context.h"    // Context
+#include "xgboost/linalg.h"       // TensorView
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 
 static constexpr size_t kPartitionBlockSize = 2048;
 
@@ -34,9 +38,10 @@ class ColumnSplitHelper {
     missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
   }
 
+  template <typename ExpandEntry>
   void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
                  GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
-                 std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
+                 std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
     // When data is split by column, we don't have all the feature values in the local worker, so
     // we first collect all the decisions and whether the feature is missing into bit vectors.
     std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
@@ -97,17 +102,18 @@ class CommonRowPartitioner {
     }
   }
 
-  void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
+  template <typename ExpandEntry>
+  void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
                            const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions) {
     auto const& ptrs = gmat.cut.Ptrs();
     auto const& vals = gmat.cut.Values();
 
     for (std::size_t i = 0; i < nodes.size(); ++i) {
-      bst_node_t const nid = nodes[i].nid;
-      bst_feature_t const fid = tree[nid].SplitIndex();
-      const float split_pt = tree[nid].SplitCond();
-      const uint32_t lower_bound = ptrs[fid];
-      const uint32_t upper_bound = ptrs[fid + 1];
+      bst_node_t const nidx = nodes[i].nid;
+      bst_feature_t const fidx = tree.SplitIndex(nidx);
+      float const split_pt = tree.SplitCond(nidx);
+      std::uint32_t const lower_bound = ptrs[fidx];
+      std::uint32_t const upper_bound = ptrs[fidx + 1];
       bst_bin_t split_cond = -1;
       // convert floating-point split_pt into corresponding bin_id
       // split_cond = -1 indicates that split_pt is less than all known cut points
@@ -121,20 +127,22 @@ class CommonRowPartitioner {
     }
   }
 
-  void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree const* p_tree) {
+  template <typename ExpandEntry>
+  void AddSplitsToRowSet(const std::vector<ExpandEntry>& nodes, RegTree const* p_tree) {
     const size_t n_nodes = nodes.size();
     for (unsigned int i = 0; i < n_nodes; ++i) {
-      const int32_t nid = nodes[i].nid;
+      const int32_t nidx = nodes[i].nid;
       const size_t n_left = partition_builder_.GetNLeftElems(i);
       const size_t n_right = partition_builder_.GetNRightElems(i);
-      CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild());
-      row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(),
-                                   n_left, n_right);
+      CHECK_EQ(p_tree->LeftChild(nidx) + 1, p_tree->RightChild(nidx));
+      row_set_collection_.AddSplit(nidx, p_tree->LeftChild(nidx), p_tree->RightChild(nidx), n_left,
+                                   n_right);
     }
   }
 
+  template <typename ExpandEntry>
   void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
-                      std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
+                      std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
     auto const& column_matrix = gmat.Transpose();
     if (column_matrix.IsInitialized()) {
       if (gmat.cut.HasCategorical()) {
@@ -152,10 +160,10 @@ class CommonRowPartitioner {
     }
   }
 
-  template <bool any_cat>
+  template <bool any_cat, typename ExpandEntry>
   void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
                       const common::ColumnMatrix& column_matrix,
-                      std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
+                      std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
     if (column_matrix.AnyMissing()) {
       this->template UpdatePosition<true, any_cat>(ctx, gmat, column_matrix, nodes, p_tree);
     } else {
@@ -163,33 +171,21 @@ class CommonRowPartitioner {
     }
   }
 
-  template <bool any_missing, bool any_cat>
+  template <bool any_missing, bool any_cat, typename ExpandEntry>
   void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
                       const common::ColumnMatrix& column_matrix,
-                      std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
-    switch (column_matrix.GetTypeSize()) {
-      case common::kUint8BinsTypeSize:
-        this->template UpdatePosition<uint8_t, any_missing, any_cat>(ctx, gmat, column_matrix,
-                                                                     nodes, p_tree);
-        break;
-      case common::kUint16BinsTypeSize:
-        this->template UpdatePosition<uint16_t, any_missing, any_cat>(ctx, gmat, column_matrix,
-                                                                      nodes, p_tree);
-        break;
-      case common::kUint32BinsTypeSize:
-        this->template UpdatePosition<uint32_t, any_missing, any_cat>(ctx, gmat, column_matrix,
-                                                                      nodes, p_tree);
-        break;
-      default:
-        // no default behavior
-        CHECK(false) << column_matrix.GetTypeSize();
-    }
+                      std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
+    common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto t) {
+      using T = decltype(t);
+      this->template UpdatePosition<T, any_missing, any_cat>(ctx, gmat, column_matrix, nodes,
+                                                             p_tree);
+    });
   }
 
-  template <typename BinIdxType, bool any_missing, bool any_cat>
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
   void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
                       const common::ColumnMatrix& column_matrix,
-                      std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
+                      std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
     // 1. Find split condition for each split
     size_t n_nodes = nodes.size();
 
@@ -251,9 +247,9 @@ class CommonRowPartitioner {
     AddSplitsToRowSet(nodes, p_tree);
   }
 
-  auto const& Partitions() const { return row_set_collection_; }
+  [[nodiscard]] auto const& Partitions() const { return row_set_collection_; }
 
-  size_t Size() const {
+  [[nodiscard]] std::size_t Size() const {
     return std::distance(row_set_collection_.begin(), row_set_collection_.end());
   }
 
@@ -266,12 +262,29 @@ class CommonRowPartitioner {
                                      [&](size_t idx) -> bool { return hess[idx] - .0f == .0f; });
   }
 
+  void LeafPartition(Context const* ctx, RegTree const& tree,
+                     linalg::TensorView<GradientPair const, 2> gpair,
+                     std::vector<bst_node_t>* p_out_position) const {
+    if (gpair.Shape(1) > 1) {
+      partition_builder_.LeafPartition(
+          ctx, tree, this->Partitions(), p_out_position, [&](std::size_t idx) -> bool {
+            auto sample = gpair.Slice(idx, linalg::All());
+            return std::all_of(linalg::cbegin(sample), linalg::cend(sample),
+                               [](GradientPair const& g) { return g.GetHess() - .0f == .0f; });
+          });
+    } else {
+      auto s = gpair.Slice(linalg::All(), 0);
+      partition_builder_.LeafPartition(
+          ctx, tree, this->Partitions(), p_out_position,
+          [&](std::size_t idx) -> bool { return s(idx).GetHess() - .0f == .0f; });
+    }
+  }
   void LeafPartition(Context const* ctx, RegTree const& tree,
                      common::Span<GradientPair const> gpair,
                      std::vector<bst_node_t>* p_out_position) const {
     partition_builder_.LeafPartition(
         ctx, tree, this->Partitions(), p_out_position,
-        [&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
+        [&](std::size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
   }
 
  private:
@@ -281,6 +294,5 @@ class CommonRowPartitioner {
   ColumnSplitHelper column_split_helper_;
 };
 
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
 #endif  // XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
diff --git a/tests/cpp/common/test_partition_builder.cc b/tests/cpp/common/test_partition_builder.cc
index 4e6d800a7..08dd345f2 100644
--- a/tests/cpp/common/test_partition_builder.cc
+++ b/tests/cpp/common/test_partition_builder.cc
@@ -1,15 +1,17 @@
+/**
+ * Copyright 2020-2023 by XGBoost contributors
+ */
 #include <gtest/gtest.h>
-#include <vector>
+
 #include <string>
 #include <utility>
+#include <vector>
 
-#include "../../../src/common/row_set.h"
 #include "../../../src/common/partition_builder.h"
+#include "../../../src/common/row_set.h"
 #include "../helpers.h"
 
-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 TEST(PartitionBuilder, BasicTest) {
   constexpr size_t kBlockSize = 16;
   constexpr size_t kNodes = 5;
@@ -74,6 +76,4 @@ TEST(PartitionBuilder, BasicTest) {
     ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]);
   }
 }
-
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index 308ae0823..6f2b83511 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -148,78 +148,5 @@ TEST(Approx, PartitionerColSplit) {
   RunWithInMemoryCommunicator(kWorkers, TestColumnSplitPartitioner, n_samples, base_rowid, Xy,
                               &hess, min_value, mid_value, mid_partitioner);
 }
-
-namespace {
-void TestLeafPartition(size_t n_samples) {
-  size_t const n_features = 2, base_rowid = 0;
-  Context ctx;
-  common::RowSetCollection row_set;
-  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
-
-  auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
-  std::vector<CPUExpandEntry> candidates{{0, 0}};
-  candidates.front().split.loss_chg = 0.4;
-  RegTree tree;
-  std::vector<float> hess(n_samples, 0);
-  // emulate sampling
-  auto not_sampled = [](size_t i) {
-    size_t const kSampleFactor{3};
-    return i % kSampleFactor != 0;
-  };
-  for (size_t i = 0; i < hess.size(); ++i) {
-    if (not_sampled(i)) {
-      hess[i] = 1.0f;
-    }
-  }
-
-  std::vector<size_t> h_nptr;
-  float split_value{0};
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({Context::kCpuId, 64})) {
-    bst_feature_t const split_ind = 0;
-    auto ptr = page.cut.Ptrs()[split_ind + 1];
-    split_value = page.cut.Values().at(ptr / 2);
-    GetSplit(&tree, split_value, &candidates);
-    partitioner.UpdatePosition(&ctx, page, candidates, &tree);
-    std::vector<bst_node_t> position;
-    partitioner.LeafPartition(&ctx, tree, hess, &position);
-    std::sort(position.begin(), position.end());
-    size_t beg = std::distance(
-        position.begin(),
-        std::find_if(position.begin(), position.end(), [&](bst_node_t nidx) { return nidx >= 0; }));
-    std::vector<size_t> nptr;
-    common::RunLengthEncode(position.cbegin() + beg, position.cend(), &nptr);
-    std::transform(nptr.begin(), nptr.end(), nptr.begin(), [&](size_t x) { return x + beg; });
-    auto n_uniques = std::unique(position.begin() + beg, position.end()) - (position.begin() + beg);
-    ASSERT_EQ(nptr.size(), n_uniques + 1);
-    ASSERT_EQ(nptr[0], beg);
-    ASSERT_EQ(nptr.back(), n_samples);
-
-    h_nptr = nptr;
-  }
-
-  if (h_nptr.front() == n_samples) {
-    return;
-  }
-
-  ASSERT_GE(h_nptr.size(), 2);
-
-  for (auto const& page : Xy->GetBatches<SparsePage>()) {
-    auto batch = page.GetView();
-    size_t left{0};
-    for (size_t i = 0; i < batch.Size(); ++i) {
-      if (not_sampled(i) && batch[i].front().fvalue < split_value) {
-        left++;
-      }
-    }
-    ASSERT_EQ(left, h_nptr[1] - h_nptr[0]);  // equal to number of sampled assigned to left
-  }
-}
-}  // anonymous namespace
-
-TEST(Approx, LeafPartition) {
-  for (auto n_samples : {0ul, 1ul, 128ul, 256ul}) {
-    TestLeafPartition(n_samples);
-  }
-}
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_common_partitioner.cc b/tests/cpp/tree/test_common_partitioner.cc
new file mode 100644
index 000000000..7e47ec289
--- /dev/null
+++ b/tests/cpp/tree/test_common_partitioner.cc
@@ -0,0 +1,93 @@
+/**
+ * Copyright 2022-2023 by XGBoost contributors.
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>                         // for bst_node_t
+#include <xgboost/context.h>                      // for Context
+
+#include <algorithm>                              // for find_if, sort, transform, unique
+#include <iterator>                               // for distance
+#include <vector>                                 // for vector
+
+#include "../../../src/common/numeric.h"          // for RunLengthEncode
+#include "../../../src/common/row_set.h"          // for RowSetCollection
+#include "../../../src/data/gradient_index.h"     // for GHistIndexMatrix
+#include "../../../src/tree/common_row_partitioner.h"
+#include "../../../src/tree/hist/expand_entry.h"  // for CPUExpandEntry
+#include "../helpers.h"                           // for RandomDataGenerator
+#include "test_partitioner.h"                     // for GetSplit
+
+namespace xgboost::tree {
+namespace {
+void TestLeafPartition(size_t n_samples) {
+  size_t const n_features = 2, base_rowid = 0;
+  Context ctx;
+  common::RowSetCollection row_set;
+  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
+
+  auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
+  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
+  RegTree tree;
+  std::vector<float> hess(n_samples, 0);
+  // emulate sampling
+  auto not_sampled = [](size_t i) {
+    size_t const kSampleFactor{3};
+    return i % kSampleFactor != 0;
+  };
+  for (size_t i = 0; i < hess.size(); ++i) {
+    if (not_sampled(i)) {
+      hess[i] = 1.0f;
+    }
+  }
+
+  std::vector<size_t> h_nptr;
+  float split_value{0};
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({Context::kCpuId, 64})) {
+    bst_feature_t const split_ind = 0;
+    auto ptr = page.cut.Ptrs()[split_ind + 1];
+    split_value = page.cut.Values().at(ptr / 2);
+    GetSplit(&tree, split_value, &candidates);
+    partitioner.UpdatePosition(&ctx, page, candidates, &tree);
+    std::vector<bst_node_t> position;
+    partitioner.LeafPartition(&ctx, tree, hess, &position);
+    std::sort(position.begin(), position.end());
+    size_t beg = std::distance(
+        position.begin(),
+        std::find_if(position.begin(), position.end(), [&](bst_node_t nidx) { return nidx >= 0; }));
+    std::vector<size_t> nptr;
+    common::RunLengthEncode(position.cbegin() + beg, position.cend(), &nptr);
+    std::transform(nptr.begin(), nptr.end(), nptr.begin(), [&](size_t x) { return x + beg; });
+    auto n_uniques = std::unique(position.begin() + beg, position.end()) - (position.begin() + beg);
+    ASSERT_EQ(nptr.size(), n_uniques + 1);
+    ASSERT_EQ(nptr[0], beg);
+    ASSERT_EQ(nptr.back(), n_samples);
+
+    h_nptr = nptr;
+  }
+
+  if (h_nptr.front() == n_samples) {
+    return;
+  }
+
+  ASSERT_GE(h_nptr.size(), 2);
+
+  for (auto const& page : Xy->GetBatches<SparsePage>()) {
+    auto batch = page.GetView();
+    size_t left{0};
+    for (size_t i = 0; i < batch.Size(); ++i) {
+      if (not_sampled(i) && batch[i].front().fvalue < split_value) {
+        left++;
+      }
+    }
+    ASSERT_EQ(left, h_nptr[1] - h_nptr[0]);  // equal to number of sampled assigned to left
+  }
+}
+}  // anonymous namespace
+
+TEST(CommonRowPartitioner, LeafPartition) {
+  for (auto n_samples : {0ul, 1ul, 128ul, 256ul}) {
+    TestLeafPartition(n_samples);
+  }
+}
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_partitioner.h b/tests/cpp/tree/test_partitioner.h
index 093aa69eb..fbd98ddf9 100644
--- a/tests/cpp/tree/test_partitioner.h
+++ b/tests/cpp/tree/test_partitioner.h
@@ -1,17 +1,20 @@
-/*!
- * Copyright 2021-2022, XGBoost contributors.
+/**
+ * Copyright 2021-2023 by XGBoost contributors.
  */
 #ifndef XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
 #define XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
-#include <xgboost/tree_model.h>
+#include <xgboost/context.h>                      // for Context
+#include <xgboost/linalg.h>                       // for Constant, Vector
+#include <xgboost/logging.h>                      // for CHECK
+#include <xgboost/tree_model.h>                   // for RegTree
 
-#include <vector>
+#include <vector>                                 // for vector
 
-#include "../../../src/tree/hist/expand_entry.h"
+#include "../../../src/tree/hist/expand_entry.h"  // for CPUExpandEntry, MultiExpandEntry
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 inline void GetSplit(RegTree *tree, float split_value, std::vector<CPUExpandEntry> *candidates) {
+  CHECK(!tree->IsMultiTarget());
   tree->ExpandNode(
       /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
       /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
@@ -21,6 +24,22 @@ inline void GetSplit(RegTree *tree, float split_value, std::vector<CPUExpandEntr
   candidates->front().split.sindex = 0;
   candidates->front().split.sindex |= (1U << 31);
 }
-}  // namespace tree
-}  // namespace xgboost
+
+inline void GetMultiSplitForTest(RegTree *tree, float split_value,
+                                 std::vector<MultiExpandEntry> *candidates) {
+  CHECK(tree->IsMultiTarget());
+  auto n_targets = tree->NumTargets();
+  Context ctx;
+  linalg::Vector<float> base_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
+  linalg::Vector<float> left_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
+  linalg::Vector<float> right_weight{linalg::Constant(&ctx, 0.0f, n_targets)};
+
+  tree->ExpandNode(/*nidx=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
+                   /*default_left=*/true, base_weight.HostView(), left_weight.HostView(),
+                   right_weight.HostView());
+  candidates->front().split.split_value = split_value;
+  candidates->front().split.sindex = 0;
+  candidates->front().split.sindex |= (1U << 31);
+}
+}  // namespace xgboost::tree
 #endif  // XGBOOST_TESTS_CPP_TREE_TEST_PARTITIONER_H_
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 42edc2124..2aa1b8f47 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -1,25 +1,29 @@
-/*!
- * Copyright 2018-2022 by XGBoost Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
 #include <xgboost/tree_updater.h>
 
 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <string>
 #include <vector>
 
+#include "../../../src/tree/common_row_partitioner.h"
+#include "../../../src/tree/hist/expand_entry.h"  // for MultiExpandEntry, CPUExpandEntry
 #include "../../../src/tree/param.h"
 #include "../../../src/tree/split_evaluator.h"
-#include "../../../src/tree/common_row_partitioner.h"
 #include "../helpers.h"
 #include "test_partitioner.h"
 #include "xgboost/data.h"
 
-namespace xgboost {
-namespace tree {
-TEST(QuantileHist, Partitioner) {
-  size_t n_samples = 1024, n_features = 1, base_rowid = 0;
+namespace xgboost::tree {
+template <typename ExpandEntry>
+void TestPartitioner(bst_target_t n_targets) {
+  std::size_t n_samples = 1024, base_rowid = 0;
+  bst_feature_t n_features = 1;
+
   Context ctx;
   ctx.InitAllowUnknown(Args{});
 
@@ -29,7 +33,7 @@ TEST(QuantileHist, Partitioner) {
   ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
 
   auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
-  std::vector<CPUExpandEntry> candidates{{0, 0}};
+  std::vector<ExpandEntry> candidates{{0, 0}};
   candidates.front().split.loss_chg = 0.4;
 
   auto cuts = common::SketchOnDMatrix(Xy.get(), 64, ctx.Threads());
@@ -41,9 +45,13 @@ TEST(QuantileHist, Partitioner) {
     column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
     {
       auto min_value = gmat.cut.MinValues()[split_ind];
-      RegTree tree;
+      RegTree tree{n_targets, n_features};
       CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
-      GetSplit(&tree, min_value, &candidates);
+      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+        GetSplit(&tree, min_value, &candidates);
+      } else {
+        GetMultiSplitForTest(&tree, min_value, &candidates);
+      }
       partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
       ASSERT_EQ(partitioner.Size(), 3);
       ASSERT_EQ(partitioner[1].Size(), 0);
@@ -53,9 +61,13 @@ TEST(QuantileHist, Partitioner) {
       CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
       auto ptr = gmat.cut.Ptrs()[split_ind + 1];
       float split_value = gmat.cut.Values().at(ptr / 2);
-      RegTree tree;
-      GetSplit(&tree, split_value, &candidates);
-      auto left_nidx = tree[RegTree::kRoot].LeftChild();
+      RegTree tree{n_targets, n_features};
+      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+        GetSplit(&tree, split_value, &candidates);
+      } else {
+        GetMultiSplitForTest(&tree, split_value, &candidates);
+      }
+      auto left_nidx = tree.LeftChild(RegTree::kRoot);
       partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
 
       auto elem = partitioner[left_nidx];
@@ -65,14 +77,17 @@ TEST(QuantileHist, Partitioner) {
         auto value = gmat.cut.Values().at(gmat.index[*it]);
         ASSERT_LE(value, split_value);
       }
-      auto right_nidx = tree[RegTree::kRoot].RightChild();
+      auto right_nidx = tree.RightChild(RegTree::kRoot);
       elem = partitioner[right_nidx];
       for (auto it = elem.begin; it != elem.end; ++it) {
         auto value = gmat.cut.Values().at(gmat.index[*it]);
-        ASSERT_GT(value, split_value) << *it;
+        ASSERT_GT(value, split_value);
       }
     }
   }
 }
-}  // namespace tree
-}  // namespace xgboost
+
+TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }
+
+TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }
+}  // namespace xgboost::tree

From 55ed50c860b9ba5c8eea5ef35da92a352e5a4f0b Mon Sep 17 00:00:00 2001
From: Quentin Fiard <quentin.fiard@polytechnique.org>
Date: Thu, 16 Mar 2023 13:24:03 +0100
Subject: [PATCH 18/32] Fix a few typos in the C API tutorial (#8926)

---
 doc/tutorials/c_api_tutorial.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst
index ca121e1d2..090743a0f 100644
--- a/doc/tutorials/c_api_tutorial.rst
+++ b/doc/tutorials/c_api_tutorial.rst
@@ -134,7 +134,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f
       // do something with booster
 
       //free the memory
-      XGBoosterFree(booster)
+      XGBoosterFree(booster);
 
       DMatrixHandle DMatrixHandle_param;
 
@@ -156,7 +156,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f
 .. code-block:: c
 
     BoosterHandle booster;
-    XGBoosterSetParam(booster, "paramter_name", "0.1");
+    XGBoosterSetParam(booster, "parameter_name", "0.1");
 
 
 **************************************************************

From 36263dd109f8a11010a7beba34328946ec2b09a9 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Thu, 16 Mar 2023 20:06:42 -0700
Subject: [PATCH 19/32] [jvm-packages] Use akka 2.6 (#8920)

---
 jvm-packages/xgboost4j-gpu/pom.xml            | 4 ++--
 jvm-packages/xgboost4j-tester/generate_pom.py | 4 ++--
 jvm-packages/xgboost4j/pom.xml                | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index 4d35d2e76..1da88c3cc 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -41,13 +41,13 @@
         <dependency>
             <groupId>com.typesafe.akka</groupId>
             <artifactId>akka-actor_${scala.binary.version}</artifactId>
-            <version>2.7.0</version>
+            <version>2.6.20</version>
             <scope>compile</scope>
         </dependency>
         <dependency>
             <groupId>com.typesafe.akka</groupId>
             <artifactId>akka-testkit_${scala.binary.version}</artifactId>
-            <version>2.7.0</version>
+            <version>2.6.20</version>
             <scope>test</scope>
         </dependency>
         <dependency>
diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py
index ff651a4f7..edc9759bd 100644
--- a/jvm-packages/xgboost4j-tester/generate_pom.py
+++ b/jvm-packages/xgboost4j-tester/generate_pom.py
@@ -51,13 +51,13 @@ pom_template = """
     <dependency>
       <groupId>com.typesafe.akka</groupId>
       <artifactId>akka-actor_${{scala.binary.version}}</artifactId>
-      <version>2.7.0</version>
+      <version>2.6.20</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>com.typesafe.akka</groupId>
       <artifactId>akka-testkit_${{scala.binary.version}}</artifactId>
-      <version>2.7.0</version>
+      <version>2.6.20</version>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index dcc4bf60c..946b11108 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -34,13 +34,13 @@
         <dependency>
             <groupId>com.typesafe.akka</groupId>
             <artifactId>akka-actor_${scala.binary.version}</artifactId>
-            <version>2.7.0</version>
+            <version>2.6.20</version>
             <scope>compile</scope>
         </dependency>
         <dependency>
             <groupId>com.typesafe.akka</groupId>
             <artifactId>akka-testkit_${scala.binary.version}</artifactId>
-            <version>2.7.0</version>
+            <version>2.6.20</version>
             <scope>test</scope>
         </dependency>
         <dependency>

From 9b6cc0ed07519040c677ce2f1ff64d170b0f428e Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 17 Mar 2023 17:21:04 +0800
Subject: [PATCH 20/32] Refactor hist to prepare for multi-target builder.
 (#8928)

- Extract the builder from the updater class. We need a new builder for multi-target.
- Extract `UpdateTree` so that it can be reused for different builders. Eventually, other
  tree updaters can use it as well.
---
 src/tree/updater_quantile_hist.cc | 544 ++++++++++++++++++------------
 src/tree/updater_quantile_hist.h  | 133 --------
 2 files changed, 319 insertions(+), 358 deletions(-)
 delete mode 100644 src/tree/updater_quantile_hist.h

diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 7d5f6efb3..7e5955dc8 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -4,263 +4,160 @@
  * \brief use quantized feature values to construct a tree
  * \author Philip Cho, Tianqi Checn, Egor Smirnov
  */
-#include "./updater_quantile_hist.h"
+#include <algorithm>                         // for max
+#include <cstddef>                           // for size_t
+#include <cstdint>                           // for uint32_t
+#include <memory>                            // for unique_ptr, allocator, make_unique, make_shared
+#include <ostream>                           // for operator<<, char_traits, basic_ostream
+#include <tuple>                             // for apply
+#include <utility>                           // for move, swap
+#include <vector>                            // for vector
 
-#include <algorithm>
-#include <cstddef>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
+#include "../collective/communicator-inl.h"  // for Allreduce, IsDistributed
+#include "../collective/communicator.h"      // for Operation
+#include "../common/hist_util.h"             // for HistogramCuts, HistCollection
+#include "../common/random.h"                // for ColumnSampler
+#include "../common/threading_utils.h"       // for ParallelFor
+#include "../common/timer.h"                 // for Monitor
+#include "../data/gradient_index.h"          // for GHistIndexMatrix
+#include "common_row_partitioner.h"          // for CommonRowPartitioner
+#include "dmlc/registry.h"                   // for DMLC_REGISTRY_FILE_TAG
+#include "driver.h"                          // for Driver
+#include "hist/evaluate_splits.h"            // for HistEvaluator, UpdatePredictionCacheImpl
+#include "hist/expand_entry.h"               // for CPUExpandEntry
+#include "hist/histogram.h"                  // for HistogramBuilder, ConstructHistSpace
+#include "hist/sampler.h"                    // for SampleGradient
+#include "param.h"                           // for TrainParam, GradStats
+#include "xgboost/base.h"                    // for GradientPair, GradientPairInternal, bst_node_t
+#include "xgboost/context.h"                 // for Context
+#include "xgboost/data.h"                    // for BatchIterator, BatchSet, DMatrix, MetaInfo
+#include "xgboost/host_device_vector.h"      // for HostDeviceVector
+#include "xgboost/linalg.h"                  // for TensorView, MatrixView, UnravelIndex, All
+#include "xgboost/logging.h"                 // for LogCheck_EQ, LogCheck_GE, CHECK_EQ, LOG, LOG...
+#include "xgboost/span.h"                    // for Span, operator!=, SpanIterator
+#include "xgboost/string_view.h"             // for operator<<
+#include "xgboost/task.h"                    // for ObjInfo
+#include "xgboost/tree_model.h"              // for RegTree, MTNotImplemented, RTreeNodeStat
+#include "xgboost/tree_updater.h"            // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE...
 
-#include "common_row_partitioner.h"
-#include "constraints.h"
-#include "hist/evaluate_splits.h"
-#include "hist/histogram.h"
-#include "hist/sampler.h"
-#include "param.h"
-#include "xgboost/linalg.h"
-#include "xgboost/logging.h"
-#include "xgboost/tree_updater.h"
-
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 
 DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
 
-void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair,
-                               DMatrix *dmat,
-                               common::Span<HostDeviceVector<bst_node_t>> out_position,
-                               const std::vector<RegTree *> &trees) {
-  // build tree
-  const size_t n_trees = trees.size();
-  if (!pimpl_) {
-    pimpl_.reset(new Builder(n_trees, param, dmat, *task_, ctx_));
-  }
+BatchParam HistBatch(TrainParam const *param) { return {param->max_bin, param->sparse_threshold}; }
 
-  size_t t_idx{0};
-  for (auto p_tree : trees) {
-    auto &t_row_position = out_position[t_idx];
-    this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
-    ++t_idx;
-  }
-}
-
-bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data,
-                                              linalg::VectorView<float> out_preds) {
-  if (pimpl_) {
-    return pimpl_->UpdatePredictionCache(data, out_preds);
-  } else {
-    return false;
-  }
-}
-
-CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
-    DMatrix *p_fmat, RegTree *p_tree, const std::vector<GradientPair> &gpair_h) {
-  CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
-
-  size_t page_id = 0;
-  auto space = ConstructHistSpace(partitioner_, {node});
-  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
-    std::vector<CPUExpandEntry> nodes_to_build{node};
-    std::vector<CPUExpandEntry> nodes_to_sub;
-    this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
-                                        partitioner_.at(page_id).Partitions(), nodes_to_build,
-                                        nodes_to_sub, gpair_h);
-    ++page_id;
-  }
-
-  {
-    GradientPairPrecise grad_stat;
-    if (p_fmat->IsDense()) {
-      /**
-       * Specialized code for dense data: For dense data (with no missing value), the sum
-       * of gradient histogram is equal to snode[nid]
-       */
-      auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
-      std::vector<uint32_t> const &row_ptr = gmat.cut.Ptrs();
-      CHECK_GE(row_ptr.size(), 2);
-      uint32_t const ibegin = row_ptr[0];
-      uint32_t const iend = row_ptr[1];
-      auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
-      auto begin = hist.data();
-      for (uint32_t i = ibegin; i < iend; ++i) {
-        GradientPairPrecise const &et = begin[i];
-        grad_stat.Add(et.GetGrad(), et.GetHess());
-      }
-    } else {
-      for (auto const &grad : gpair_h) {
-        grad_stat.Add(grad.GetGrad(), grad.GetHess());
-      }
-      collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat), 2);
-    }
-
-    auto weight = evaluator_->InitRoot(GradStats{grad_stat});
-    p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
-    p_tree->Stat(RegTree::kRoot).base_weight = weight;
-    (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
-
-    std::vector<CPUExpandEntry> entries{node};
-    monitor_->Start("EvaluateSplits");
-    auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
-      evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree, &entries);
-      break;
-    }
-    monitor_->Stop("EvaluateSplits");
-    node = entries.front();
-  }
-
-  return node;
-}
-
-void QuantileHistMaker::Builder::BuildHistogram(DMatrix *p_fmat, RegTree *p_tree,
-                                                std::vector<CPUExpandEntry> const &valid_candidates,
-                                                std::vector<GradientPair> const &gpair) {
-  std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
-  std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
-
-  size_t n_idx = 0;
-  for (auto const &c : valid_candidates) {
-    auto left_nidx = (*p_tree)[c.nid].LeftChild();
-    auto right_nidx = (*p_tree)[c.nid].RightChild();
-    auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
-
-    auto build_nidx = left_nidx;
-    auto subtract_nidx = right_nidx;
-    if (fewer_right) {
-      std::swap(build_nidx, subtract_nidx);
-    }
-    nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
-    nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
-    n_idx++;
-  }
-
-  size_t page_id{0};
-  auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
-    histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
-                                  partitioner_.at(page_id).Partitions(), nodes_to_build,
-                                  nodes_to_sub, gpair);
-    ++page_id;
-  }
-}
-
-void QuantileHistMaker::Builder::LeafPartition(RegTree const &tree,
-                                               common::Span<GradientPair const> gpair,
-                                               std::vector<bst_node_t> *p_out_position) {
+template <typename ExpandEntry, typename Updater>
+void UpdateTree(common::Monitor *monitor_, linalg::MatrixView<GradientPair const> gpair,
+                Updater *updater, DMatrix *p_fmat, TrainParam const *param,
+                HostDeviceVector<bst_node_t> *p_out_position, RegTree *p_tree) {
   monitor_->Start(__func__);
-  if (!task_.UpdateTreeLeaf()) {
-    return;
-  }
-  for (auto const &part : partitioner_) {
-    part.LeafPartition(ctx_, tree, gpair, p_out_position);
-  }
-  monitor_->Stop(__func__);
-}
+  updater->InitData(p_fmat, p_tree);
 
-void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree,
-                                            const std::vector<GradientPair> &gpair_h,
-                                            HostDeviceVector<bst_node_t> *p_out_position) {
-  monitor_->Start(__func__);
-
-  Driver<CPUExpandEntry> driver(*param_);
-  driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h));
+  Driver<ExpandEntry> driver{*param};
   auto const &tree = *p_tree;
+  driver.Push(updater->InitRoot(p_fmat, gpair, p_tree));
   auto expand_set = driver.Pop();
 
+  /**
+   * Note for update position
+   * Root:
+   *   Not applied: No need to update position as initialization has got all the rows ordered.
+   *   Applied: Update position is run on applied nodes so the rows are partitioned.
+   * Non-root:
+   *   Not applied: That node is root of the subtree, same rule as root.
+   *   Applied: Ditto
+   */
   while (!expand_set.empty()) {
     // candidates that can be further splited.
-    std::vector<CPUExpandEntry> valid_candidates;
+    std::vector<ExpandEntry> valid_candidates;
     // candidaates that can be applied.
-    std::vector<CPUExpandEntry> applied;
-    int32_t depth = expand_set.front().depth + 1;
-    for (auto const& candidate : expand_set) {
-      evaluator_->ApplyTreeSplit(candidate, p_tree);
+    std::vector<ExpandEntry> applied;
+    for (auto const &candidate : expand_set) {
+      updater->ApplyTreeSplit(candidate, p_tree);
+      CHECK_GT(p_tree->LeftChild(candidate.nid), candidate.nid);
       applied.push_back(candidate);
       if (driver.IsChildValid(candidate)) {
         valid_candidates.emplace_back(candidate);
       }
     }
 
-    monitor_->Start("UpdatePosition");
-    size_t page_id{0};
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
-      partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
-      ++page_id;
-    }
-    monitor_->Stop("UpdatePosition");
+    updater->UpdatePosition(p_fmat, p_tree, applied);
 
-    std::vector<CPUExpandEntry> best_splits;
+    std::vector<ExpandEntry> best_splits;
     if (!valid_candidates.empty()) {
-      this->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair_h);
+      updater->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair);
       for (auto const &candidate : valid_candidates) {
-        int left_child_nidx = tree[candidate.nid].LeftChild();
-        int right_child_nidx = tree[candidate.nid].RightChild();
-        CPUExpandEntry l_best{left_child_nidx, depth};
-        CPUExpandEntry r_best{right_child_nidx, depth};
+        auto left_child_nidx = tree.LeftChild(candidate.nid);
+        auto right_child_nidx = tree.RightChild(candidate.nid);
+        ExpandEntry l_best{left_child_nidx, tree.GetDepth(left_child_nidx)};
+        ExpandEntry r_best{right_child_nidx, tree.GetDepth(right_child_nidx)};
         best_splits.push_back(l_best);
         best_splits.push_back(r_best);
       }
-      auto const &histograms = histogram_builder_->Histogram();
-      auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
-        evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, &best_splits);
-        break;
-      }
+      updater->EvaluateSplits(p_fmat, p_tree, &best_splits);
     }
     driver.Push(best_splits.begin(), best_splits.end());
     expand_set = driver.Pop();
   }
 
   auto &h_out_position = p_out_position->HostVector();
-  this->LeafPartition(tree, gpair_h, &h_out_position);
+  updater->LeafPartition(tree, gpair, &h_out_position);
   monitor_->Stop(__func__);
 }
 
-void QuantileHistMaker::Builder::UpdateTree(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
-                                            RegTree *p_tree,
-                                            HostDeviceVector<bst_node_t> *p_out_position) {
-  monitor_->Start(__func__);
+class HistBuilder {
+ private:
+  common::Monitor *monitor_;
+  TrainParam const *param_;
+  std::shared_ptr<common::ColumnSampler> col_sampler_;
+  std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
+  std::vector<CommonRowPartitioner> partitioner_;
 
-  std::vector<GradientPair> *gpair_ptr = &(gpair->HostVector());
-  // in case 'num_parallel_trees != 1' no posibility to change initial gpair
-  if (GetNumberOfTrees() != 1) {
-    gpair_local_.resize(gpair_ptr->size());
-    gpair_local_ = *gpair_ptr;
-    gpair_ptr = &gpair_local_;
+  // back pointers to tree and data matrix
+  const RegTree *p_last_tree_{nullptr};
+  DMatrix const *const p_last_fmat_{nullptr};
+
+  std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
+  ObjInfo const *task_{nullptr};
+  // Context for number of threads
+  Context const *ctx_{nullptr};
+
+ public:
+  explicit HistBuilder(Context const *ctx, std::shared_ptr<common::ColumnSampler> column_sampler,
+                       TrainParam const *param, DMatrix const *fmat, ObjInfo const *task,
+                       common::Monitor *monitor)
+      : monitor_{monitor},
+        param_{param},
+        col_sampler_{std::move(column_sampler)},
+        evaluator_{std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx, param, fmat->Info(),
+                                                                   col_sampler_)},
+        p_last_fmat_(fmat),
+        histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
+        task_{task},
+        ctx_{ctx} {
+    monitor_->Init(__func__);
   }
 
-  this->InitData(p_fmat, *p_tree, gpair_ptr);
-
-  ExpandTree(p_fmat, p_tree, *gpair_ptr, p_out_position);
-  monitor_->Stop(__func__);
-}
-
-bool QuantileHistMaker::Builder::UpdatePredictionCache(DMatrix const *data,
-                                                       linalg::VectorView<float> out_preds) const {
-  // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
-  // conjunction with Update().
-  if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
-    return false;
+  bool UpdatePredictionCache(DMatrix const *data, linalg::VectorView<float> out_preds) const {
+    // p_last_fmat_ is a valid pointer as long as UpdatePredictionCache() is called in
+    // conjunction with Update().
+    if (!p_last_fmat_ || !p_last_tree_ || data != p_last_fmat_) {
+      return false;
+    }
+    monitor_->Start(__func__);
+    CHECK_EQ(out_preds.Size(), data->Info().num_row_);
+    UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds);
+    monitor_->Stop(__func__);
+    return true;
   }
-  monitor_->Start(__func__);
-  CHECK_EQ(out_preds.Size(), data->Info().num_row_);
-  UpdatePredictionCacheImpl(ctx_, p_last_tree_, partitioner_, out_preds);
-  monitor_->Stop(__func__);
-  return true;
-}
 
-size_t QuantileHistMaker::Builder::GetNumberOfTrees() { return n_trees_; }
+ public:
+  // initialize temp data structure
+  void InitData(DMatrix *fmat, RegTree const *p_tree) {
+    monitor_->Start(__func__);
 
-void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
-                                          std::vector<GradientPair> *gpair) {
-  monitor_->Start(__func__);
-  const auto& info = fmat->Info();
-
-  {
     size_t page_id{0};
-    int32_t n_total_bins{0};
+    bst_bin_t n_total_bins{0};
     partitioner_.clear();
     for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
       if (n_total_bins == 0) {
@@ -273,22 +170,219 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
     }
     histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
                               collective::IsDistributed(), fmat->IsColumnSplit());
-
-    auto m_gpair = linalg::MakeTensorView(ctx_, *gpair, gpair->size(), static_cast<std::size_t>(1));
-    SampleGradient(ctx_, *param_, m_gpair);
+    evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
+                                                                 col_sampler_);
+    p_last_tree_ = p_tree;
   }
 
-  // store a pointer to the tree
-  p_last_tree_ = &tree;
-  evaluator_.reset(new HistEvaluator<CPUExpandEntry>{ctx_, param_, info, column_sampler_});
+  void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
+                      std::vector<CPUExpandEntry> *best_splits) {
+    monitor_->Start(__func__);
+    auto const &histograms = histogram_builder_->Histogram();
+    auto ft = p_fmat->Info().feature_types.ConstHostSpan();
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
+      break;
+    }
+    monitor_->Stop(__func__);
+  }
 
-  monitor_->Stop(__func__);
-}
+  void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) {
+    this->evaluator_->ApplyTreeSplit(candidate, p_tree);
+  }
+
+  CPUExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
+                          RegTree *p_tree) {
+    CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
+
+    size_t page_id = 0;
+    auto space = ConstructHistSpace(partitioner_, {node});
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      std::vector<CPUExpandEntry> nodes_to_build{node};
+      std::vector<CPUExpandEntry> nodes_to_sub;
+      this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
+                                          partitioner_.at(page_id).Partitions(), nodes_to_build,
+                                          nodes_to_sub, gpair.Slice(linalg::All(), 0).Values());
+      ++page_id;
+    }
+
+    {
+      GradientPairPrecise grad_stat;
+      if (p_fmat->IsDense()) {
+        /**
+         * Specialized code for dense data: For dense data (with no missing value), the sum
+         * of gradient histogram is equal to snode[nid]
+         */
+        auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
+        std::vector<uint32_t> const &row_ptr = gmat.cut.Ptrs();
+        CHECK_GE(row_ptr.size(), 2);
+        uint32_t const ibegin = row_ptr[0];
+        uint32_t const iend = row_ptr[1];
+        auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
+        auto begin = hist.data();
+        for (uint32_t i = ibegin; i < iend; ++i) {
+          GradientPairPrecise const &et = begin[i];
+          grad_stat.Add(et.GetGrad(), et.GetHess());
+        }
+      } else {
+        auto gpair_h = gpair.Slice(linalg::All(), 0).Values();
+        for (auto const &grad : gpair_h) {
+          grad_stat.Add(grad.GetGrad(), grad.GetHess());
+        }
+        collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&grad_stat),
+                                                           2);
+      }
+
+      auto weight = evaluator_->InitRoot(GradStats{grad_stat});
+      p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
+      p_tree->Stat(RegTree::kRoot).base_weight = weight;
+      (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
+
+      std::vector<CPUExpandEntry> entries{node};
+      monitor_->Start("EvaluateSplits");
+      auto ft = p_fmat->Info().feature_types.ConstHostSpan();
+      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+        evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
+                                   &entries);
+        break;
+      }
+      monitor_->Stop("EvaluateSplits");
+      node = entries.front();
+    }
+
+    return node;
+  }
+
+  void BuildHistogram(DMatrix *p_fmat, RegTree *p_tree,
+                      std::vector<CPUExpandEntry> const &valid_candidates,
+                      linalg::MatrixView<GradientPair const> gpair) {
+    std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
+    std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
+
+    size_t n_idx = 0;
+    for (auto const &c : valid_candidates) {
+      auto left_nidx = (*p_tree)[c.nid].LeftChild();
+      auto right_nidx = (*p_tree)[c.nid].RightChild();
+      auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
+
+      auto build_nidx = left_nidx;
+      auto subtract_nidx = right_nidx;
+      if (fewer_right) {
+        std::swap(build_nidx, subtract_nidx);
+      }
+      nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}};
+      nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}};
+      n_idx++;
+    }
+
+    size_t page_id{0};
+    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
+                                    partitioner_.at(page_id).Partitions(), nodes_to_build,
+                                    nodes_to_sub, gpair.Values());
+      ++page_id;
+    }
+  }
+
+  void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
+                      std::vector<CPUExpandEntry> const &applied) {
+    monitor_->Start(__func__);
+    std::size_t page_id{0};
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
+      this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
+      page_id++;
+    }
+    monitor_->Stop(__func__);
+  }
+
+  void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
+                     std::vector<bst_node_t> *p_out_position) {
+    if (!task_->UpdateTreeLeaf()) {
+      return;  // bail out before Start() so the monitor's Start/Stop calls stay paired
+    }
+    monitor_->Start(__func__);
+    for (auto const &part : partitioner_) {
+      part.LeafPartition(ctx_, tree, gpair, p_out_position);
+    }
+    monitor_->Stop(__func__);
+  }
+};
+
+/*! \brief construct a tree using quantized feature values */
+class QuantileHistMaker : public TreeUpdater {
+  std::unique_ptr<HistBuilder> p_impl_;
+  std::shared_ptr<common::ColumnSampler> column_sampler_ =
+      std::make_shared<common::ColumnSampler>();
+  common::Monitor monitor_;
+  ObjInfo const *task_;
+
+ public:
+  explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
+      : TreeUpdater{ctx}, task_{task} {}
+  void Configure(const Args &) override {}
+
+  void LoadConfig(Json const &) override {}
+  void SaveConfig(Json *) const override {}
+
+  [[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; }
+
+  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
+              const std::vector<RegTree *> &trees) override {
+    if (trees.front()->IsMultiTarget()) {
+      CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
+      LOG(FATAL) << "Not implemented.";
+    } else {
+      if (!p_impl_) {
+        p_impl_ =
+            std::make_unique<HistBuilder>(ctx_, column_sampler_, param, p_fmat, task_, &monitor_);
+      }
+    }
+
+    bst_target_t n_targets = trees.front()->NumTargets();
+    auto h_gpair =
+        linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets);
+
+    linalg::Matrix<GradientPair> sample_out;
+    auto h_sample_out = h_gpair;
+    auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
+    if (need_copy()) {
+      // allocate buffer
+      sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
+      h_sample_out = sample_out.HostView();
+    }
+
+    for (auto tree_it = trees.begin(); tree_it != trees.end(); ++tree_it) {
+      if (need_copy()) {
+        // Copy gradient into buffer for sampling.
+        std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
+      }
+      SampleGradient(ctx_, *param, h_sample_out);
+      auto *h_out_position = &out_position[tree_it - trees.begin()];
+      if ((*tree_it)->IsMultiTarget()) {
+        LOG(FATAL) << "Not implemented.";
+      } else {
+        UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
+                                   h_out_position, *tree_it);
+      }
+    }
+  }
+
+  bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
+    if (p_impl_) {
+      return p_impl_->UpdatePredictionCache(data, out_preds);
+    } else {
+      return false;
+    }
+  }
+
+  [[nodiscard]] bool HasNodePosition() const override { return true; }
+};
 
 XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
     .describe("Grow tree using quantized histogram.")
     .set_body([](Context const *ctx, ObjInfo const *task) {
       return new QuantileHistMaker(ctx, task);
     });
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h
deleted file mode 100644
index 138d5646a..000000000
--- a/src/tree/updater_quantile_hist.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors
- * \file updater_quantile_hist.h
- * \brief use quantized feature values to construct a tree
- * \author Philip Cho, Tianqi Chen, Egor Smirnov
- */
-#ifndef XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
-#define XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
-
-#include <xgboost/tree_updater.h>
-
-#include <algorithm>
-#include <limits>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/json.h"
-
-#include "hist/evaluate_splits.h"
-#include "hist/histogram.h"
-#include "hist/expand_entry.h"
-
-#include "common_row_partitioner.h"
-#include "constraints.h"
-#include "./param.h"
-#include "./driver.h"
-#include "../common/random.h"
-#include "../common/timer.h"
-#include "../common/hist_util.h"
-#include "../common/row_set.h"
-#include "../common/partition_builder.h"
-#include "../common/column_matrix.h"
-
-namespace xgboost::tree {
-inline BatchParam HistBatch(TrainParam const* param) {
-  return {param->max_bin, param->sparse_threshold};
-}
-
-/*! \brief construct a tree using quantized feature values */
-class QuantileHistMaker: public TreeUpdater {
- public:
-  explicit QuantileHistMaker(Context const* ctx, ObjInfo const* task)
-      : TreeUpdater(ctx), task_{task} {}
-  void Configure(const Args&) override {}
-
-  void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
-              common::Span<HostDeviceVector<bst_node_t>> out_position,
-              const std::vector<RegTree*>& trees) override;
-
-  bool UpdatePredictionCache(const DMatrix *data,
-                             linalg::VectorView<float> out_preds) override;
-
-  void LoadConfig(Json const&) override {}
-  void SaveConfig(Json*) const override {}
-
-  [[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; }
-  [[nodiscard]] bool HasNodePosition() const override { return true; }
-
- protected:
-  // actual builder that runs the algorithm
-  struct Builder {
-   public:
-    // constructor
-    explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat,
-                     ObjInfo task, Context const* ctx)
-        : n_trees_(n_trees),
-          param_(param),
-          p_last_fmat_(fmat),
-          histogram_builder_{new HistogramBuilder<CPUExpandEntry>},
-          task_{task},
-          ctx_{ctx},
-          monitor_{std::make_unique<common::Monitor>()} {
-      monitor_->Init("Quantile::Builder");
-    }
-    // update one tree, growing
-    void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
-                    HostDeviceVector<bst_node_t>* p_out_position);
-
-    bool UpdatePredictionCache(DMatrix const* data, linalg::VectorView<float> out_preds) const;
-
-   private:
-    // initialize temp data structure
-    void InitData(DMatrix* fmat, const RegTree& tree, std::vector<GradientPair>* gpair);
-
-    size_t GetNumberOfTrees();
-
-    CPUExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree,
-                            const std::vector<GradientPair>& gpair_h);
-
-    void BuildHistogram(DMatrix* p_fmat, RegTree* p_tree,
-                        std::vector<CPUExpandEntry> const& valid_candidates,
-                        std::vector<GradientPair> const& gpair);
-
-    void LeafPartition(RegTree const& tree, common::Span<GradientPair const> gpair,
-                       std::vector<bst_node_t>* p_out_position);
-
-    void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector<GradientPair>& gpair_h,
-                    HostDeviceVector<bst_node_t>* p_out_position);
-
-   private:
-    const size_t n_trees_;
-    TrainParam const* param_;
-    std::shared_ptr<common::ColumnSampler> column_sampler_{
-        std::make_shared<common::ColumnSampler>()};
-
-    std::vector<GradientPair> gpair_local_;
-
-    std::unique_ptr<HistEvaluator<CPUExpandEntry>> evaluator_;
-    std::vector<CommonRowPartitioner> partitioner_;
-
-    // back pointers to tree and data matrix
-    const RegTree* p_last_tree_{nullptr};
-    DMatrix const* const p_last_fmat_;
-
-    std::unique_ptr<HistogramBuilder<CPUExpandEntry>> histogram_builder_;
-    ObjInfo task_;
-    // Context for number of threads
-    Context const* ctx_;
-
-    std::unique_ptr<common::Monitor> monitor_;
-  };
-
- protected:
-  std::unique_ptr<Builder> pimpl_;
-  ObjInfo const* task_;
-};
-}  // namespace xgboost::tree
-
-#endif  // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_

From 34092d7fd061d45462cae8b5d97c5d74f10552f2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 21 Mar 2023 15:34:43 +0800
Subject: [PATCH 21/32] Bump maven-release-plugin in
 /jvm-packages/xgboost4j-spark (#8952)

Bumps [maven-release-plugin](https://github.com/apache/maven-release) from 2.5.3 to 3.0.0.
- [Release notes](https://github.com/apache/maven-release/releases)
- [Commits](https://github.com/apache/maven-release/compare/maven-release-2.5.3...maven-release-3.0.0)

---
updated-dependencies:
- dependency-name: org.apache.maven.plugins:maven-release-plugin
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index b97eccc01..e662b762a 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -118,7 +118,7 @@
                     <plugin>
                         <groupId>org.apache.maven.plugins</groupId>
                         <artifactId>maven-release-plugin</artifactId>
-                        <version>2.5.3</version>
+                        <version>3.0.0</version>
                         <configuration>
                             <autoVersionSubmodules>true</autoVersionSubmodules>
                             <useReleaseProfile>false</useReleaseProfile>

From 8dc1e4b3ea6f1b440c3806c04f18157e23fbd7a9 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Tue, 21 Mar 2023 09:22:11 -0700
Subject: [PATCH 22/32] Improve doxygen (#8959)

* Remove Sphinx build from GH Action

* Build Doxygen as part of RTD build

* Add jQuery
---
 .github/workflows/main.yml |  37 -------
 doc/c++.rst                |   2 +-
 doc/c.rst                  |   2 +-
 doc/conf.py                | 191 ++++++++++++++++++++++---------------
 4 files changed, 114 insertions(+), 118 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ac50b744b..ab2a58fe9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -156,40 +156,3 @@ jobs:
             xgboost \
             cpp \
             include src python-package
-
-  sphinx:
-    runs-on: ubuntu-latest
-    name: Build docs using Sphinx
-    steps:
-    - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
-      with:
-        submodules: 'true'
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
-      with:
-        python-version: "3.8"
-        architecture: 'x64'
-    - name: Install system packages
-      run: |
-        sudo apt-get install -y --no-install-recommends graphviz doxygen ninja-build
-        python -m pip install wheel setuptools awscli
-        python -m pip install -r doc/requirements.txt
-    - name: Extract branch name
-      shell: bash
-      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
-      id: extract_branch
-      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
-    - name: Run Sphinx
-      run: |
-        make -C doc html
-      env:
-        SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
-        READTHEDOCS: "True"
-
-    - name: Publish
-      run: |
-        tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doxygen/doc_doxygen/
-        python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
-      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
-      env:
-        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
-        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
diff --git a/doc/c++.rst b/doc/c++.rst
index 4a045fc42..ce30bbefa 100644
--- a/doc/c++.rst
+++ b/doc/c++.rst
@@ -8,5 +8,5 @@ As a result it's changing quite often and we don't maintain its stability.  Alon
 plugin system (see ``plugin/example`` in XGBoost's source tree), users can utilize some
 existing c++ headers for gaining more access to the internal of XGBoost.
 
-* `C++ interface documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/files.html>`_
+* `C++ interface documentation (latest master branch) <./dev/files.html>`_
 * `C++ interface documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/files.html>`_
diff --git a/doc/c.rst b/doc/c.rst
index 02581b874..d63e779e1 100644
--- a/doc/c.rst
+++ b/doc/c.rst
@@ -10,7 +10,7 @@ simply look at function comments in ``include/xgboost/c_api.h``. The reference i
 to sphinx with the help of breathe, which doesn't contain links to examples but might be
 easier to read. For the original doxygen pages please visit:
 
-* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
+* `C API documentation (latest master branch) <./dev/c__api_8h.html>`_
 * `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_
 
 ***************
diff --git a/doc/conf.py b/doc/conf.py
index 7d585e420..73fe48acc 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -13,53 +13,106 @@
 # serve to show the default.
 import os
 import re
+import shutil
 import subprocess
 import sys
+import tarfile
 import urllib.request
+import warnings
 from subprocess import call
 from urllib.error import HTTPError
 
 from sh.contrib import git
 
-git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None)
+CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
+TMP_DIR = os.path.join(CURR_PATH, "tmp")
+DOX_DIR = "doxygen"
+
+
+def run_doxygen():
+    """Build the Doxygen HTML with CMake/Ninja and copy it into TMP_DIR/dev."""
+    curdir = os.path.normpath(os.path.abspath(os.path.curdir))
+    if os.path.exists(TMP_DIR):
+        print(f"Delete directory {TMP_DIR}")
+        shutil.rmtree(TMP_DIR)
+    # Recreate unconditionally so TMP_DIR exists even after a stale copy was removed.
+    print(f"Create directory {TMP_DIR}")
+    os.mkdir(TMP_DIR)
+    try:
+        os.chdir(PROJECT_ROOT)
+        if not os.path.exists(DOX_DIR):
+            os.mkdir(DOX_DIR)
+        os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
+        print(
+            "Build doxygen at {}".format(
+                os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen")
+            )
+        )
+        subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
+        subprocess.check_call(["ninja", "doc_doxygen"])
+
+        src = os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen", "html")
+        dest = os.path.join(TMP_DIR, "dev")  # must not exist yet: copytree creates it
+        print(f"Copy directory {src} -> {dest}")
+        shutil.copytree(src, dest)
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: %s\n" % e)
+    finally:
+        os.chdir(curdir)  # always restore the caller's working directory
+
+
+def is_readthedocs_build():
+    if os.environ.get("READTHEDOCS", None) == "True":
+        return True
+    warnings.warn(
+        "Skipping Doxygen build... You won't have documentation for C/C++ functions. "
+        "Set environment variable READTHEDOCS=True if you want to build Doxygen. "
+        "(If you do opt in, make sure to install Doxygen, Graphviz, CMake, and C++ compiler "
+        "on your system.)"
+    )
+    return False
+
+
+if is_readthedocs_build():
+    run_doxygen()
+
+
+git_branch = os.getenv("SPHINX_GIT_BRANCH", default=None)
 if not git_branch:
     # If SPHINX_GIT_BRANCH environment variable is not given, run git
     # to determine branch name
     git_branch = [
-        re.sub(r'origin/', '', x.lstrip(' ')) for x in str(
-            git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')
+        re.sub(r"origin/", "", x.lstrip(" "))
+        for x in str(git.branch("-r", "--contains", "HEAD")).rstrip("\n").split("\n")
     ]
-    git_branch = [x for x in git_branch if 'HEAD' not in x]
+    git_branch = [x for x in git_branch if "HEAD" not in x]
 else:
     git_branch = [git_branch]
-print('git_branch = {}'.format(git_branch[0]))
+print("git_branch = {}".format(git_branch[0]))
 
 try:
     filename, _ = urllib.request.urlretrieve(
-        'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(
-            git_branch[0]))
-    call(
-        'if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'
-        .format(filename),
-        shell=True)
+        f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{git_branch[0]}.tar.bz2"
+    )
+    if not os.path.exists(TMP_DIR):
+        print(f"Create directory {TMP_DIR}")
+        os.mkdir(TMP_DIR)
+    jvm_doc_dir = os.path.join(TMP_DIR, "jvm")
+    if os.path.exists(jvm_doc_dir):
+        print(f"Delete directory {jvm_doc_dir}")
+        shutil.rmtree(jvm_doc_dir)
+    print(f"Create directory {jvm_doc_dir}")
+    os.mkdir(jvm_doc_dir)
+
+    with tarfile.open(filename, "r:bz2") as t:
+        t.extractall(jvm_doc_dir)
 except HTTPError:
-    print('JVM doc not found. Skipping...')
-try:
-    filename, _ = urllib.request.urlretrieve(
-        'https://s3-us-west-2.amazonaws.com/xgboost-docs/doxygen/{}.tar.bz2'.
-        format(git_branch[0]))
-    call(
-        'mkdir -p tmp/dev; cd tmp/dev; tar xvf {}; mv doc_doxygen/html/* .; rm -rf doc_doxygen'
-        .format(filename),
-        shell=True)
-except HTTPError:
-    print('C API doc not found. Skipping...')
+    print("JVM doc not found. Skipping...")
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
 libpath = os.path.join(PROJECT_ROOT, "python-package/")
 sys.path.insert(0, libpath)
 sys.path.insert(0, CURR_PATH)
@@ -82,50 +135,56 @@ release = xgboost.__version__
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
 extensions = [
-    'matplotlib.sphinxext.plot_directive',
-    'sphinx.ext.autodoc',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.intersphinx',
+    "matplotlib.sphinxext.plot_directive",
+    "sphinxcontrib.jquery",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.intersphinx",
     "sphinx_gallery.gen_gallery",
-    'breathe',
-    'recommonmark'
+    "breathe",
+    "recommonmark",
 ]
 
 sphinx_gallery_conf = {
     # path to your example scripts
     "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
     # path to where to save gallery generated output
-    "gallery_dirs": ["python/examples", "python/dask-examples", "python/survival-examples"],
+    "gallery_dirs": [
+        "python/examples",
+        "python/dask-examples",
+        "python/survival-examples",
+    ],
     "matplotlib_animations": True,
 }
 
 autodoc_typehints = "description"
 
-graphviz_output_format = 'png'
-plot_formats = [('svg', 300), ('png', 100), ('hires.png', 300)]
+graphviz_output_format = "png"
+plot_formats = [("svg", 300), ("png", 100), ("hires.png", 300)]
 plot_html_show_source_link = False
 plot_html_show_formats = False
 
 # Breathe extension variables
-DOX_DIR = "doxygen"
-breathe_projects = {
-    "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
-}
+breathe_projects = {}
+if is_readthedocs_build():
+    breathe_projects = {
+        "xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
+    }
 breathe_default_project = "xgboost"
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
-source_suffix = ['.rst', '.md']
+source_suffix = [".rst", ".md"]
 
 # The encoding of source files.
 # source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -134,7 +193,7 @@ master_doc = 'index'
 # Usually you set "language" from the command line for these cases.
 language = "en"
 
-autoclass_content = 'both'
+autoclass_content = "both"
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
@@ -144,8 +203,10 @@ autoclass_content = 'both'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-html_extra_path = ['./tmp']
+exclude_patterns = ["_build"]
+html_extra_path = []
+if is_readthedocs_build():
+    html_extra_path = [TMP_DIR]
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -163,7 +224,7 @@ html_extra_path = ['./tmp']
 # show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
 # modindex_common_prefix = []
@@ -186,27 +247,24 @@ html_logo = "https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/lo
 
 html_css_files = ["css/custom.css"]
 
-html_sidebars = {
-  '**': ['logo-text.html', 'globaltoc.html', 'searchbox.html']
-}
+html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "searchbox.html"]}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = project + 'doc'
+htmlhelp_basename = project + "doc"
 
 # -- Options for LaTeX output ---------------------------------------------
-latex_elements = {
-}
+latex_elements = {}
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  (master_doc, '%s.tex' % project, project, author, 'manual'),
+    (master_doc, "%s.tex" % project, project, author, "manual"),
 ]
 
 intersphinx_mapping = {
@@ -221,30 +279,5 @@ intersphinx_mapping = {
 }
 
 
-# hook for doxygen
-def run_doxygen():
-    """Run the doxygen make command in the designated folder."""
-    curdir = os.path.normpath(os.path.abspath(os.path.curdir))
-    try:
-        os.chdir(PROJECT_ROOT)
-        if not os.path.exists(DOX_DIR):
-            os.mkdir(DOX_DIR)
-        os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
-        subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
-        subprocess.check_call(["ninja", "doc_doxygen"])
-    except OSError as e:
-        sys.stderr.write("doxygen execution failed: %s" % e)
-    finally:
-        os.chdir(curdir)
-
-
-def generate_doxygen_xml(app):
-    """Run the doxygen make commands if we're on the ReadTheDocs server"""
-    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
-    if read_the_docs_build:
-        run_doxygen()
-
-
 def setup(app):
-    app.add_css_file('custom.css')
-    app.connect("builder-inited", generate_doxygen_xml)
+    app.add_css_file("custom.css")

From b240f055d362d149057e181e0e14243d8123911b Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Tue, 21 Mar 2023 23:25:26 -0700
Subject: [PATCH 23/32] Support vertical federated learning (#8932)

---
 include/xgboost/data.h                        | 26 +++++--
 src/data/data.cc                              | 57 ++++++++++-----
 src/data/data.cu                              |  8 +--
 src/data/iterative_dmatrix.cc                 |  2 +-
 src/data/iterative_dmatrix.cu                 |  2 +-
 src/data/simple_dmatrix.cc                    | 56 +++++++++++----
 src/data/simple_dmatrix.cu                    | 12 ++--
 src/data/simple_dmatrix.h                     | 12 +++-
 src/data/sparse_page_dmatrix.cc               |  2 +-
 src/learner.cc                                | 45 +++++++++++-
 src/objective/init_estimation.cc              |  2 +-
 src/tree/fit_stump.cc                         | 15 ++--
 src/tree/fit_stump.h                          |  3 +-
 tests/cpp/data/test_data.cc                   | 27 ++------
 tests/cpp/helpers.cc                          | 23 +++++++
 tests/cpp/helpers.h                           |  2 +
 tests/cpp/plugin/helpers.cc                   | 19 -----
 tests/cpp/plugin/helpers.h                    | 69 +++++++++++++++++--
 tests/cpp/plugin/test_federated_adapter.cu    | 65 ++++-------------
 .../cpp/plugin/test_federated_communicator.cc | 60 +++-------------
 tests/cpp/plugin/test_federated_data.cc       | 65 +++++++++++++++++
 tests/cpp/plugin/test_federated_server.cc     | 45 ++----------
 tests/cpp/tree/test_fit_stump.cc              |  3 +-
 23 files changed, 371 insertions(+), 249 deletions(-)
 delete mode 100644 tests/cpp/plugin/helpers.cc
 create mode 100644 tests/cpp/plugin/test_federated_data.cc

diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index ec78c588d..57f8a0e36 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -171,6 +171,15 @@ class MetaInfo {
    */
   void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column);
 
+  /**
+   * @brief Synchronize the number of columns across all workers.
+   *
+   * Normally we just need to find the maximum number of columns across all workers, but
+   * in vertical federated learning, since each worker loads its own list of columns,
+   * we need to sum them.
+   */
+  void SynchronizeNumberOfColumns();
+
  private:
   void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
   void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
@@ -325,6 +334,10 @@ class SparsePage {
    * \brief Check wether the column index is sorted.
    */
   bool IsIndicesSorted(int32_t n_threads) const;
+  /**
+   * \brief Reindex the column index with an offset.
+   */
+  void Reindex(uint64_t feature_offset, int32_t n_threads);
 
   void SortRows(int32_t n_threads);
 
@@ -559,17 +572,18 @@ class DMatrix {
    * \brief Creates a new DMatrix from an external data adapter.
    *
    * \tparam  AdapterT  Type of the adapter.
-   * \param [in,out]  adapter       View onto an external data.
-   * \param           missing       Values to count as missing.
-   * \param           nthread       Number of threads for construction.
-   * \param           cache_prefix  (Optional) The cache prefix for external memory.
-   * \param           page_size     (Optional) Size of the page.
+   * \param [in,out]  adapter         View onto an external data.
+   * \param           missing         Values to count as missing.
+   * \param           nthread         Number of threads for construction.
+   * \param           cache_prefix    (Optional) The cache prefix for external memory.
+   * \param           data_split_mode (Optional) Data split mode.
    *
    * \return  a Created DMatrix.
    */
   template <typename AdapterT>
   static DMatrix* Create(AdapterT* adapter, float missing, int nthread,
-                         const std::string& cache_prefix = "");
+                         const std::string& cache_prefix = "",
+                         DataSplitMode data_split_mode = DataSplitMode::kRow);
 
   /**
    * \brief Create a new Quantile based DMatrix used for histogram based algorithm.
diff --git a/src/data/data.cc b/src/data/data.cc
index aa96a1bc8..6f5d52817 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -703,6 +703,14 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
   }
 }
 
+void MetaInfo::SynchronizeNumberOfColumns() {
+  if (collective::IsFederated() && data_split_mode == DataSplitMode::kCol) {
+    collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
+  } else {
+    collective::Allreduce<collective::Operation::kMax>(&num_col_, 1);
+  }
+}
+
 void MetaInfo::Validate(std::int32_t device) const {
   if (group_ptr_.size() != 0 && weights_.Size() != 0) {
     CHECK_EQ(group_ptr_.size(), weights_.Size() + 1)
@@ -870,7 +878,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
           dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
       data::FileAdapter adapter(parser.get());
       dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
-                             cache_file);
+                             cache_file, data_split_mode);
     } else {
       data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
                               file_format};
@@ -906,11 +914,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     LOG(FATAL) << "Encountered parser error:\n" << e.what();
   }
 
-  /* sync up number of features after matrix loaded.
-   * partitioned data will fail the train/val validation check
-   * since partitioned data not knowing the real number of features. */
-  collective::Allreduce<collective::Operation::kMax>(&dmat->Info().num_col_, 1);
-
   if (need_split && data_split_mode == DataSplitMode::kCol) {
     if (!cache_file.empty()) {
       LOG(FATAL) << "Column-wise data split is not support for external memory.";
@@ -920,7 +923,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     delete dmat;
     return sliced;
   } else {
-    dmat->Info().data_split_mode = data_split_mode;
     return dmat;
   }
 }
@@ -957,39 +959,49 @@ template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
     XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
 
 template <typename AdapterT>
-DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&) {
-  return new data::SimpleDMatrix(adapter, missing, nthread);
+DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&,
+                         DataSplitMode data_split_mode) {
+  return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
 }
 
 template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing,
                                                       std::int32_t nthread,
-                                                      const std::string& cache_prefix);
+                                                      const std::string& cache_prefix,
+                                                      DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing,
                                                       std::int32_t nthread,
-                                                      const std::string& cache_prefix);
+                                                      const std::string& cache_prefix,
+                                                      DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing,
                                                     std::int32_t nthread,
-                                                    const std::string& cache_prefix);
+                                                    const std::string& cache_prefix,
+                                                    DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing,
                                                     std::int32_t nthread,
-                                                    const std::string& cache_prefix);
+                                                    const std::string& cache_prefix,
+                                                    DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter,
                                                           float missing, std::int32_t nthread,
-                                                          const std::string& cache_prefix);
+                                                          const std::string& cache_prefix,
+                                                          DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing,
                                                      std::int32_t nthread,
-                                                     const std::string& cache_prefix);
+                                                     const std::string& cache_prefix,
+                                                     DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter,
                                                          float missing, std::int32_t nthread,
-                                                         const std::string& cache_prefix);
+                                                         const std::string& cache_prefix,
+                                                         DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter,
                                                          float missing, std::int32_t nthread,
-                                                         const std::string& cache_prefix);
+                                                         const std::string& cache_prefix,
+                                                         DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create(
     data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
-    float missing, int nthread, const std::string& cache_prefix);
+    float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(
-    data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&);
+    data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&,
+    DataSplitMode data_split_mode);
 
 SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const {
   SparsePage transpose;
@@ -1051,6 +1063,13 @@ void SparsePage::SortIndices(int32_t n_threads) {
   });
 }
 
+void SparsePage::Reindex(uint64_t feature_offset, int32_t n_threads) {
+  auto& h_data = this->data.HostVector();
+  common::ParallelFor(h_data.size(), n_threads, [&](auto i) {
+    h_data[i].index += feature_offset;
+  });
+}
+
 void SparsePage::SortRows(int32_t n_threads) {
   auto& h_offset = this->offset.HostVector();
   auto& h_data = this->data.HostVector();
diff --git a/src/data/data.cu b/src/data/data.cu
index 4dedc7d24..eccbe7567 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -170,17 +170,17 @@ void MetaInfo::SetInfoFromCUDA(Context const& ctx, StringView key, Json array) {
 
 template <typename AdapterT>
 DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
-                         const std::string& cache_prefix) {
+                         const std::string& cache_prefix, DataSplitMode data_split_mode) {
   CHECK_EQ(cache_prefix.size(), 0)
       << "Device memory construction is not currently supported with external "
          "memory.";
-  return new data::SimpleDMatrix(adapter, missing, nthread);
+  return new data::SimpleDMatrix(adapter, missing, nthread, data_split_mode);
 }
 
 template DMatrix* DMatrix::Create<data::CudfAdapter>(
     data::CudfAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix);
+    const std::string& cache_prefix, DataSplitMode data_split_mode);
 template DMatrix* DMatrix::Create<data::CupyAdapter>(
     data::CupyAdapter* adapter, float missing, int nthread,
-    const std::string& cache_prefix);
+    const std::string& cache_prefix, DataSplitMode data_split_mode);
 }  // namespace xgboost
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index ae0cfc4a4..c7ac492c9 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -190,7 +190,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   // From here on Info() has the correct data shape
   Info().num_row_ = accumulated_rows;
   Info().num_nonzero_ = nnz;
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  Info().SynchronizeNumberOfColumns();
   CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) {
     return f > accumulated_rows;
   })) << "Something went wrong during iteration.";
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 2d4a0bb0b..5e7fc8d4f 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -166,7 +166,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
 
   iter.Reset();
   // Synchronise worker columns
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  info_.SynchronizeNumberOfColumns();
 }
 
 BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index 014b57282..098c3c4f2 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -73,6 +73,19 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
   return out;
 }
 
+void SimpleDMatrix::ReindexFeatures() {  // shift local column ids past lower ranks' columns
+  if (collective::IsFederated() && info_.data_split_mode == DataSplitMode::kCol) {
+    std::vector<uint64_t> buffer(collective::GetWorldSize());
+    buffer[collective::GetRank()] = info_.num_col_;
+    collective::Allgather(buffer.data(), buffer.size() * sizeof(uint64_t));
+    auto const lower = buffer.cbegin() + collective::GetRank();  // end of the lower-rank counts
+    auto const offset = std::accumulate(buffer.cbegin(), lower, uint64_t{0});  // 64-bit sum
+    if (offset != 0) {  // nothing to shift when no lower rank contributes columns
+      sparse_page_->Reindex(offset, ctx_.Threads());
+    }
+  }
+}
+
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
   auto begin_iter = BatchIterator<SparsePage>(
@@ -151,7 +164,8 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
 }
 
 template <typename AdapterT>
-SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
+SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
+                             DataSplitMode data_split_mode) {
   this->ctx_.nthread = nthread;
 
   std::vector<uint64_t> qids;
@@ -217,7 +231,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
 
 
   // Synchronise worker columns
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  info_.data_split_mode = data_split_mode;
+  ReindexFeatures();
+  info_.SynchronizeNumberOfColumns();
 
   if (adapter->NumRows() == kAdapterUnknownSize) {
     using IteratorAdapterT
@@ -272,22 +288,31 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
     fo->Write(sparse_page_->data.HostVector());
 }
 
-template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread);
-template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread);
+template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
+template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
+                                      DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(
     IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
         *adapter,
-    float missing, int nthread);
+    float missing, int nthread, DataSplitMode data_split_mode);
 
 template <>
-SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread) {
-  ctx_.nthread = nthread;
+SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
+                             DataSplitMode data_split_mode) {
+    ctx_.nthread = nthread;
 
   auto& offset_vec = sparse_page_->offset.HostVector();
   auto& data_vec = sparse_page_->data.HostVector();
@@ -346,7 +371,10 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
   }
   // Synchronise worker columns
   info_.num_col_ = adapter->NumColumns();
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  info_.data_split_mode = data_split_mode;
+  ReindexFeatures();
+  info_.SynchronizeNumberOfColumns();
+
   info_.num_row_ = total_batch_size;
   info_.num_nonzero_ = data_vec.size();
   CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index 64f308b8c..fc09f52c4 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -15,7 +15,10 @@ namespace data {
 // Current implementation assumes a single batch. More batches can
 // be supported in future. Does not currently support inferring row/column size
 template <typename AdapterT>
-SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/) {
+SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/,
+                             DataSplitMode data_split_mode) {
+  CHECK(data_split_mode != DataSplitMode::kCol)
+      << "Column-wise data split is currently not supported on the GPU.";
   auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
                                                                       : adapter->DeviceIdx();
   CHECK_GE(device, 0);
@@ -35,12 +38,13 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
   // Synchronise worker columns
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  info_.data_split_mode = data_split_mode;
+  info_.SynchronizeNumberOfColumns();
 }
 
 template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
-                                      int nthread);
+                                      int nthread, DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
-                                      int nthread);
+                                      int nthread, DataSplitMode data_split_mode);
 }  // namespace data
 }  // namespace xgboost
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
index 897abfcf0..853e765af 100644
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -22,7 +22,8 @@ class SimpleDMatrix : public DMatrix {
  public:
   SimpleDMatrix() = default;
   template <typename AdapterT>
-  explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread);
+  explicit SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
+                         DataSplitMode data_split_mode = DataSplitMode::kRow);
 
   explicit SimpleDMatrix(dmlc::Stream* in_stream);
   ~SimpleDMatrix() override = default;
@@ -61,6 +62,15 @@ class SimpleDMatrix : public DMatrix {
   bool GHistIndexExists() const override { return static_cast<bool>(gradient_index_); }
   bool SparsePageExists() const override { return true; }
 
+  /**
+   * \brief Reindex the features based on a global view.
+   *
+   * In some cases (e.g. vertical federated learning), features are loaded locally with indices
+   * starting from 0. However, all the algorithms assume the features are globally indexed, so we
+   * reindex the features based on the offset needed to obtain the global view.
+   */
+  void ReindexFeatures();
+
  private:
   Context ctx_;
 };
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index 698e1e5b2..5e5b622af 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -96,7 +96,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
   this->info_.num_col_ = n_features;
   this->info_.num_nonzero_ = nnz;
 
-  collective::Allreduce<collective::Operation::kMax>(&info_.num_col_, 1);
+  info_.SynchronizeNumberOfColumns();
   CHECK_NE(info_.num_col_, 0);
 }
 
diff --git a/src/learner.cc b/src/learner.cc
index e1b5605ca..14f57a5ba 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -440,7 +440,7 @@ class LearnerConfiguration : public Learner {
         info.Validate(Ctx()->gpu_id);
         // We estimate it from input data.
         linalg::Tensor<float, 1> base_score;
-        UsePtr(obj_)->InitEstimation(info, &base_score);
+        InitEstimation(info, &base_score);
         CHECK_EQ(base_score.Size(), 1);
         mparam_.base_score = base_score(0);
         CHECK(!std::isnan(mparam_.base_score));
@@ -857,6 +857,25 @@ class LearnerConfiguration : public Learner {
       mparam_.num_target = n_targets;
     }
   }
+
+  void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
+    bool const is_vertical_federated =
+        collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol;
+    if (!is_vertical_federated) {
+      // Regular training: every worker estimates from its own view of the data.
+      UsePtr(obj_)->InitEstimation(info, base_score);
+      return;
+    }
+    // Vertical federated learning: labels are assumed to live on worker 0 only, so the
+    // estimate is computed there and then broadcast to all the other workers.
+    if (collective::GetRank() == 0) {
+      UsePtr(obj_)->InitEstimation(info, base_score);
+    } else {
+      base_score->Reshape(1);  // make room for the single value sent by worker 0
+    }
+    collective::Broadcast(base_score->Data()->HostPointer(),
+                          sizeof(bst_float) * base_score->Size(), 0);
+  }
 };
 
 std::string const LearnerConfiguration::kEvalMetric {"eval_metric"};  // NOLINT
@@ -1307,7 +1326,7 @@ class LearnerImpl : public LearnerIO {
     monitor_.Stop("PredictRaw");
 
     monitor_.Start("GetGradient");
-    obj_->GetGradient(predt.predictions, train->Info(), iter, &gpair_);
+    GetGradient(predt.predictions, train->Info(), iter, &gpair_);
     monitor_.Stop("GetGradient");
     TrainingObserver::Instance().Observe(gpair_, "Gradients");
 
@@ -1486,6 +1505,28 @@ class LearnerImpl : public LearnerIO {
   }
 
  private:
+  void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
+                   HostDeviceVector<GradientPair>* out_gpair) {
+    bool const is_vertical_federated =
+        collective::IsFederated() && info.data_split_mode == DataSplitMode::kCol;
+    if (!is_vertical_federated) {
+      obj_->GetGradient(preds, info, iteration, out_gpair);
+      return;
+    }
+    // Vertical federated learning: labels are assumed to be available on worker 0 only, so
+    // the gradients are computed there and broadcast to every other worker.
+    if (collective::GetRank() == 0) {
+      obj_->GetGradient(preds, info, iteration, out_gpair);
+    } else {
+      CHECK_EQ(info.labels.Size(), 0)
+          << "In vertical federated learning, labels should only be on the first worker";
+      // Allocate the receive buffer: one gradient pair per prediction.
+      out_gpair->Resize(preds.Size());
+    }
+    collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
+                          0);
+  }
+
   /*! \brief random number transformation seed. */
   static int32_t constexpr kRandSeedMagic = 127;
   // gradient pairs
diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc
index 96fd5d653..938ceb59d 100644
--- a/src/objective/init_estimation.cc
+++ b/src/objective/init_estimation.cc
@@ -33,7 +33,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
   new_obj->GetGradient(dummy_predt, info, 0, &gpair);
   bst_target_t n_targets = this->Targets(info);
   linalg::Vector<float> leaf_weight;
-  tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
+  tree::FitStump(this->ctx_, info, gpair, n_targets, &leaf_weight);
 
   // workaround, we don't support multi-target due to binary model serialization for
   // base margin.
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index ad0253d22..5131f9284 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -21,7 +21,8 @@
 namespace xgboost {
 namespace tree {
 namespace cpu_impl {
-void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpair,
+void FitStump(Context const* ctx, MetaInfo const& info,
+              linalg::TensorView<GradientPair const, 2> gpair,
               linalg::VectorView<float> out) {
   auto n_targets = out.Size();
   CHECK_EQ(n_targets, gpair.Shape(1));
@@ -43,8 +44,12 @@ void FitStump(Context const* ctx, linalg::TensorView<GradientPair const, 2> gpai
     }
   }
   CHECK(h_sum.CContiguous());
-  collective::Allreduce<collective::Operation::kSum>(
-      reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
+
+  // In vertical federated learning, only worker 0 needs to call this, no need to do an allreduce.
+  if (!collective::IsFederated() || info.data_split_mode != DataSplitMode::kCol) {
+    collective::Allreduce<collective::Operation::kSum>(
+        reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
+  }
 
   for (std::size_t i = 0; i < h_sum.Size(); ++i) {
     out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
@@ -64,7 +69,7 @@ inline void FitStump(Context const*, linalg::TensorView<GradientPair const, 2>,
 #endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace cuda_impl
 
-void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
+void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
               bst_target_t n_targets, linalg::Vector<float>* out) {
   out->SetDevice(ctx->gpu_id);
   out->Reshape(n_targets);
@@ -72,7 +77,7 @@ void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
 
   gpair.SetDevice(ctx->gpu_id);
   auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets);
-  ctx->IsCPU() ? cpu_impl::FitStump(ctx, gpair_t, out->HostView())
+  ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
                : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id));
 }
 }  // namespace tree
diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h
index 1f5cd60b4..4778ecfc5 100644
--- a/src/tree/fit_stump.h
+++ b/src/tree/fit_stump.h
@@ -16,6 +16,7 @@
 #include "../common/common.h"            // AssertGPUSupport
 #include "xgboost/base.h"                // GradientPair
 #include "xgboost/context.h"             // Context
+#include "xgboost/data.h"                // MetaInfo
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 #include "xgboost/linalg.h"              // TensorView
 
@@ -30,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) {
 /**
  * @brief Fit a tree stump as an estimation of base_score.
  */
-void FitStump(Context const* ctx, HostDeviceVector<GradientPair> const& gpair,
+void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector<GradientPair> const& gpair,
               bst_target_t n_targets, linalg::Vector<float>* out);
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc
index c37328192..99cd72cc0 100644
--- a/tests/cpp/data/test_data.cc
+++ b/tests/cpp/data/test_data.cc
@@ -112,31 +112,12 @@ TEST(SparsePage, SortIndices) {
 }
 
 TEST(DMatrix, Uri) {
-  size_t constexpr kRows {16};
-  size_t constexpr kCols {8};
-  std::vector<float> data (kRows * kCols);
-
-  for (size_t i = 0; i < kRows * kCols; ++i) {
-    data[i] = i;
-  }
+  auto constexpr kRows {16};
+  auto constexpr kCols {8};
 
   dmlc::TemporaryDirectory tmpdir;
-  std::string path = tmpdir.path + "/small.csv";
-
-  std::ofstream fout(path);
-  size_t i = 0;
-  for (size_t r = 0; r < kRows; ++r) {
-    for (size_t c = 0; c < kCols; ++c) {
-      fout << data[i];
-      i++;
-      if (c != kCols - 1) {
-        fout << ",";
-      }
-    }
-    fout << "\n";
-  }
-  fout.flush();
-  fout.close();
+  auto const path = tmpdir.path + "/small.csv";
+  CreateTestCSV(path, kRows, kCols);
 
   std::unique_ptr<DMatrix> dmat;
   // FIXME(trivialfis): Enable the following test by restricting csv parser in dmlc-core.
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 9236f569f..49813f1d0 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -65,6 +65,29 @@ void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_
   }
 }
 
+void CreateTestCSV(std::string const& path, size_t rows, size_t cols) {
+  // Cell values are 0, 1, 2, ... stored as floats and laid out row by row.
+  std::vector<float> values(rows * cols);
+  for (size_t k = 0; k < rows * cols; ++k) {
+    values[k] = k;
+  }
+
+  std::ofstream fout(path);
+  auto cell = values.cbegin();
+  for (size_t r = 0; r < rows; ++r) {
+    for (size_t c = 0; c < cols; ++c) {
+      // Cells are comma separated; no separator precedes the first cell of a row.
+      if (c != 0) {
+        fout << ",";
+      }
+      fout << *cell++;
+    }
+    fout << "\n";
+  }
+  fout.flush();
+  fout.close();
+}
+
 void CheckObjFunctionImpl(std::unique_ptr<xgboost::ObjFunction> const& obj,
                           std::vector<xgboost::bst_float> preds,
                           std::vector<xgboost::bst_float> labels,
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 279e3f759..a059f0436 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -59,6 +59,8 @@ void CreateSimpleTestData(const std::string& filename);
 // 0-based indexing.
 void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true);
 
+void CreateTestCSV(std::string const& path, size_t rows, size_t cols);
+
 void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                       std::vector<xgboost::bst_float> preds,
                       std::vector<xgboost::bst_float> labels,
diff --git a/tests/cpp/plugin/helpers.cc b/tests/cpp/plugin/helpers.cc
deleted file mode 100644
index a70479b1b..000000000
--- a/tests/cpp/plugin/helpers.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <chrono>
-#include <thread>
-#include <random>
-#include <cstdint>
-
-#include "helpers.h"
-
-using namespace std::chrono_literals;
-
-int GenerateRandomPort(int low, int high) {
-  // Ensure unique timestamp by introducing a small artificial delay
-  std::this_thread::sleep_for(100ms);
-  auto timestamp = static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::milliseconds>(
-    std::chrono::system_clock::now().time_since_epoch()).count());
-  std::mt19937_64 rng(timestamp);
-  std::uniform_int_distribution<int> dist(low, high);
-  int port = dist(rng);
-  return port;
-}
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index ea72f1538..0ac6746f8 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -1,10 +1,69 @@
 /*!
- * Copyright 2022 XGBoost contributors
+ * Copyright 2022-2023 XGBoost contributors
  */
+#pragma once
 
-#ifndef XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
-#define XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
+#include <grpcpp/server_builder.h>
+#include <gtest/gtest.h>
+#include <xgboost/json.h>
 
-int GenerateRandomPort(int low, int high);
+#include <random>
 
-#endif  // XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
+#include "../../../plugin/federated/federated_server.h"
+#include "../../../src/collective/communicator-inl.h"
+
+inline int GenerateRandomPort(int low, int high) {
+  using namespace std::chrono_literals;
+  namespace chrono = std::chrono;
+  // Sleep briefly so that back-to-back calls observe different millisecond timestamps.
+  std::this_thread::sleep_for(100ms);
+  // Seed the generator with the number of milliseconds since the system clock's epoch.
+  auto const since_epoch = chrono::system_clock::now().time_since_epoch();
+  auto const seed =
+      static_cast<uint64_t>(chrono::duration_cast<chrono::milliseconds>(since_epoch).count());
+  std::mt19937_64 rng(seed);
+  return std::uniform_int_distribution<int>{low, high}(rng);
+}
+
+inline std::string GetServerAddress() {
+  // Pick a pseudo-random port in [50000, 60000] on the local host.
+  auto const port = GenerateRandomPort(50000, 60000);
+  return std::string("localhost:") + std::to_string(port);
+}
+
+namespace xgboost {
+
+// Runs an in-process federated server; it is fully built in SetUp() before TearDown() can run.
+class BaseFederatedTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    server_address_ = GetServerAddress();
+    service_.reset(new xgboost::federated::FederatedService{kWorldSize});
+    grpc::ServerBuilder builder;
+    builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
+    builder.RegisterService(service_.get());
+    server_ = builder.BuildAndStart();  // built here, not in the thread: no race on server_
+    server_thread_.reset(new std::thread([this] { server_->Wait(); }));
+  }
+
+  void TearDown() override {
+    server_->Shutdown();
+    server_thread_->join();
+  }
+
+  void InitCommunicator(int rank) {
+    Json config{JsonObject()};
+    config["xgboost_communicator"] = String("federated");
+    config["federated_server_address"] = String(server_address_);
+    config["federated_world_size"] = kWorldSize;
+    config["federated_rank"] = rank;
+    xgboost::collective::Init(config);
+  }
+
+  static int const kWorldSize{3};
+  std::string server_address_;
+  std::unique_ptr<xgboost::federated::FederatedService> service_;  // must outlive server_
+  std::unique_ptr<std::thread> server_thread_;
+  std::unique_ptr<grpc::Server> server_;
+};
+}  // namespace xgboost
diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu
index 794c60909..c4816ff18 100644
--- a/tests/cpp/plugin/test_federated_adapter.cu
+++ b/tests/cpp/plugin/test_federated_adapter.cu
@@ -1,56 +1,20 @@
 /*!
  * Copyright 2022 XGBoost contributors
  */
-#include <grpcpp/server_builder.h>
 #include <gtest/gtest.h>
 #include <thrust/host_vector.h>
 
+#include <ctime>
 #include <iostream>
 #include <thread>
-#include <ctime>
 
-#include "./helpers.h"
 #include "../../../plugin/federated/federated_communicator.h"
-#include "../../../plugin/federated/federated_server.h"
 #include "../../../src/collective/device_communicator_adapter.cuh"
+#include "./helpers.h"
 
-namespace {
+namespace xgboost::collective {
 
-std::string GetServerAddress() {
-  int port = GenerateRandomPort(50000, 60000);
-  std::string address = std::string("localhost:") + std::to_string(port);
-  return address;
-}
-
-}  // anonymous namespace
-
-namespace xgboost {
-namespace collective {
-
-class FederatedAdapterTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    server_address_ = GetServerAddress();
-    server_thread_.reset(new std::thread([this] {
-      grpc::ServerBuilder builder;
-      federated::FederatedService service{kWorldSize};
-      builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
-      builder.RegisterService(&service);
-      server_ = builder.BuildAndStart();
-      server_->Wait();
-    }));
-  }
-
-  void TearDown() override {
-    server_->Shutdown();
-    server_thread_->join();
-  }
-
-  static int const kWorldSize{2};
-  std::string server_address_;
-  std::unique_ptr<std::thread> server_thread_;
-  std::unique_ptr<grpc::Server> server_;
-};
+class FederatedAdapterTest : public BaseFederatedTest {};
 
 TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) {
   auto construct = []() { DeviceCommunicatorAdapter adapter{-1, nullptr}; };
@@ -65,20 +29,20 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
 TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(std::thread([rank, server_address=server_address_] {
+    threads.emplace_back([rank, server_address = server_address_] {
       FederatedCommunicator comm{kWorldSize, rank, server_address};
       // Assign device 0 to all workers, since we run gtest in a single-GPU machine
       DeviceCommunicatorAdapter adapter{0, &comm};
-      int const count = 3;
+      int count = 3;
       thrust::device_vector<double> buffer(count, 0);
       thrust::sequence(buffer.begin(), buffer.end());
       adapter.AllReduceSum(buffer.data().get(), count);
       thrust::host_vector<double> host_buffer = buffer;
       EXPECT_EQ(host_buffer.size(), count);
       for (auto i = 0; i < count; i++) {
-        EXPECT_EQ(host_buffer[i], i * 2);
+        EXPECT_EQ(host_buffer[i], i * kWorldSize);
       }
-    }));
+    });
   }
   for (auto& thread : threads) {
     thread.join();
@@ -88,7 +52,7 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
 TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(std::thread([rank, server_address=server_address_] {
+    threads.emplace_back([rank, server_address = server_address_] {
       FederatedCommunicator comm{kWorldSize, rank, server_address};
       // Assign device 0 to all workers, since we run gtest in a single-GPU machine
       DeviceCommunicatorAdapter adapter{0, &comm};
@@ -104,17 +68,16 @@ TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
       EXPECT_EQ(segments[0], 2);
       EXPECT_EQ(segments[1], 3);
       thrust::host_vector<char> host_buffer = receive_buffer;
-      EXPECT_EQ(host_buffer.size(), 5);
-      int expected[] = {0, 1, 0, 1, 2};
-      for (auto i = 0; i < 5; i++) {
+      EXPECT_EQ(host_buffer.size(), 9);
+      int expected[] = {0, 1, 0, 1, 2, 0, 1, 2, 3};
+      for (auto i = 0; i < 9; i++) {
         EXPECT_EQ(host_buffer[i], expected[i]);
       }
-    }));
+    });
   }
   for (auto& thread : threads) {
     thread.join();
   }
 }
 
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc
index f5d72e5f4..5177187c5 100644
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -2,65 +2,34 @@
  * Copyright 2022 XGBoost contributors
  */
 #include <dmlc/parameter.h>
-#include <grpcpp/server_builder.h>
 #include <gtest/gtest.h>
 
 #include <iostream>
 #include <thread>
-#include <ctime>
 
-#include "helpers.h"
 #include "../../../plugin/federated/federated_communicator.h"
-#include "../../../plugin/federated/federated_server.h"
+#include "helpers.h"
 
-namespace {
+namespace xgboost::collective {
 
-std::string GetServerAddress() {
-  int port = GenerateRandomPort(50000, 60000);
-  std::string address = std::string("localhost:") + std::to_string(port);
-  return address;
-}
-
-}  // anonymous namespace
-
-namespace xgboost {
-namespace collective {
-
-class FederatedCommunicatorTest : public ::testing::Test {
+class FederatedCommunicatorTest : public BaseFederatedTest {
  public:
-  static void VerifyAllgather(int rank, const std::string& server_address) {
+  static void VerifyAllgather(int rank, const std::string &server_address) {
     FederatedCommunicator comm{kWorldSize, rank, server_address};
     CheckAllgather(comm, rank);
   }
 
-  static void VerifyAllreduce(int rank, const std::string& server_address) {
+  static void VerifyAllreduce(int rank, const std::string &server_address) {
     FederatedCommunicator comm{kWorldSize, rank, server_address};
     CheckAllreduce(comm);
   }
 
-  static void VerifyBroadcast(int rank, const std::string& server_address) {
+  static void VerifyBroadcast(int rank, const std::string &server_address) {
     FederatedCommunicator comm{kWorldSize, rank, server_address};
     CheckBroadcast(comm, rank);
   }
 
  protected:
-  void SetUp() override {
-    server_address_ = GetServerAddress();
-    server_thread_.reset(new std::thread([this] {
-      grpc::ServerBuilder builder;
-      federated::FederatedService service{kWorldSize};
-      builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
-      builder.RegisterService(&service);
-      server_ = builder.BuildAndStart();
-      server_->Wait();
-    }));
-  }
-
-  void TearDown() override {
-    server_->Shutdown();
-    server_thread_->join();
-  }
-
   static void CheckAllgather(FederatedCommunicator &comm, int rank) {
     int buffer[kWorldSize] = {0, 0, 0};
     buffer[rank] = rank;
@@ -90,11 +59,6 @@ class FederatedCommunicatorTest : public ::testing::Test {
       EXPECT_EQ(buffer, "hello");
     }
   }
-
-  static int const kWorldSize{3};
-  std::string server_address_;
-  std::unique_ptr<std::thread> server_thread_;
-  std::unique_ptr<grpc::Server> server_;
 };
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
@@ -161,8 +125,7 @@ TEST(FederatedCommunicatorSimpleTest, IsDistributed) {
 TEST_F(FederatedCommunicatorTest, Allgather) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(
-        std::thread(&FederatedCommunicatorTest::VerifyAllgather, rank, server_address_));
+    threads.emplace_back(&FederatedCommunicatorTest::VerifyAllgather, rank, server_address_);
   }
   for (auto &thread : threads) {
     thread.join();
@@ -172,8 +135,7 @@ TEST_F(FederatedCommunicatorTest, Allgather) {
 TEST_F(FederatedCommunicatorTest, Allreduce) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(
-        std::thread(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_));
+    threads.emplace_back(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_);
   }
   for (auto &thread : threads) {
     thread.join();
@@ -183,12 +145,10 @@ TEST_F(FederatedCommunicatorTest, Allreduce) {
 TEST_F(FederatedCommunicatorTest, Broadcast) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(
-        std::thread(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_));
+    threads.emplace_back(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_);
   }
   for (auto &thread : threads) {
     thread.join();
   }
 }
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/tests/cpp/plugin/test_federated_data.cc b/tests/cpp/plugin/test_federated_data.cc
new file mode 100644
index 000000000..8ac89e887
--- /dev/null
+++ b/tests/cpp/plugin/test_federated_data.cc
@@ -0,0 +1,65 @@
+/*!
+ * Copyright 2023 XGBoost contributors
+ */
+#include <dmlc/parameter.h>
+#include <gtest/gtest.h>
+#include <xgboost/data.h>
+
+#include <fstream>
+#include <iostream>
+#include <thread>
+
+#include "../../../plugin/federated/federated_server.h"
+#include "../../../src/collective/communicator-inl.h"
+#include "../filesystem.h"
+#include "../helpers.h"
+#include "helpers.h"
+
+namespace xgboost {
+
+class FederatedDataTest : public BaseFederatedTest {
+ public:
+  // Worker `rank` loads a CSV with (8 + rank) columns in column-split mode, then checks that
+  // the column count is global and that its feature indices are shifted past the lower ranks.
+  void VerifyLoadUri(int rank) {
+    InitCommunicator(rank);
+
+    size_t constexpr kRows{16};
+    size_t const kCols = 8 + rank;
+
+    dmlc::TemporaryDirectory tmpdir;
+    std::string path = tmpdir.path + "/small" + std::to_string(rank) + ".csv";
+    CreateTestCSV(path, kRows, kCols);
+
+    std::unique_ptr<DMatrix> dmat{DMatrix::Load(path + "?format=csv", false, DataSplitMode::kCol)};
+
+    // Ranks 0..n-1 own 8, 9, ... columns, so the first n ranks own 8 * n + n * (n - 1) / 2.
+    ASSERT_EQ(dmat->Info().num_col_, 8 * kWorldSize + kWorldSize * (kWorldSize - 1) / 2);
+    ASSERT_EQ(dmat->Info().num_row_, kRows);
+
+    size_t const offset = 8 * rank + rank * (rank - 1) / 2;
+    for (auto const& page : dmat->GetBatches<SparsePage>()) {
+      auto entries = page.GetView().data;
+      size_t index = 0;
+      for (size_t row = 0; row < kRows; row++) {
+        for (size_t col = 0; col < kCols; col++) {
+          EXPECT_EQ(entries[index].index, col + offset);
+          index++;
+        }
+      }
+    }
+
+    xgboost::collective::Finalize();
+  }
+};
+
+TEST_F(FederatedDataTest, LoadUri) {
+  // One thread per rank; every thread acts as a separate worker against the fixture's server.
+  std::vector<std::thread> workers;
+  for (auto rank = 0; rank < kWorldSize; rank++) {
+    workers.emplace_back([this, rank] { VerifyLoadUri(rank); });
+  }
+  // Wait for all workers to finish before the fixture tears the server down.
+  for (auto& worker : workers) { worker.join(); }
+}
+}  // namespace xgboost
diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc
index fa9c272d2..79e06bf5f 100644
--- a/tests/cpp/plugin/test_federated_server.cc
+++ b/tests/cpp/plugin/test_federated_server.cc
@@ -1,30 +1,17 @@
 /*!
  * Copyright 2017-2020 XGBoost contributors
  */
-#include <grpcpp/server_builder.h>
 #include <gtest/gtest.h>
 
-#include <ctime>
 #include <iostream>
 #include <thread>
 
 #include "federated_client.h"
-#include "federated_server.h"
 #include "helpers.h"
 
-namespace {
-
-std::string GetServerAddress() {
-  int port = GenerateRandomPort(50000, 60000);
-  std::string address = std::string("localhost:") + std::to_string(port);
-  return address;
-}
-
-}  // anonymous namespace
-
 namespace xgboost {
 
-class FederatedServerTest : public ::testing::Test {
+class FederatedServerTest : public BaseFederatedTest {
  public:
   static void VerifyAllgather(int rank, const std::string& server_address) {
     federated::FederatedClient client{server_address, rank};
@@ -51,23 +38,6 @@ class FederatedServerTest : public ::testing::Test {
   }
 
  protected:
-  void SetUp() override {
-    server_address_ = GetServerAddress();
-    server_thread_.reset(new std::thread([this] {
-      grpc::ServerBuilder builder;
-      federated::FederatedService service{kWorldSize};
-      builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
-      builder.RegisterService(&service);
-      server_ = builder.BuildAndStart();
-      server_->Wait();
-    }));
-  }
-
-  void TearDown() override {
-    server_->Shutdown();
-    server_thread_->join();
-  }
-
   static void CheckAllgather(federated::FederatedClient& client, int rank) {
     int data[kWorldSize] = {0, 0, 0};
     data[rank] = rank;
@@ -98,17 +68,12 @@ class FederatedServerTest : public ::testing::Test {
     auto reply = client.Broadcast(send_buffer, 0);
     EXPECT_EQ(reply, "hello broadcast") << "rank " << rank;
   }
-
-  static int const kWorldSize{3};
-  std::string server_address_;
-  std::unique_ptr<std::thread> server_thread_;
-  std::unique_ptr<grpc::Server> server_;
 };
 
 TEST_F(FederatedServerTest, Allgather) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllgather, rank, server_address_));
+    threads.emplace_back(&FederatedServerTest::VerifyAllgather, rank, server_address_);
   }
   for (auto& thread : threads) {
     thread.join();
@@ -118,7 +83,7 @@ TEST_F(FederatedServerTest, Allgather) {
 TEST_F(FederatedServerTest, Allreduce) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllreduce, rank, server_address_));
+    threads.emplace_back(&FederatedServerTest::VerifyAllreduce, rank, server_address_);
   }
   for (auto& thread : threads) {
     thread.join();
@@ -128,7 +93,7 @@ TEST_F(FederatedServerTest, Allreduce) {
 TEST_F(FederatedServerTest, Broadcast) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(std::thread(&FederatedServerTest::VerifyBroadcast, rank, server_address_));
+    threads.emplace_back(&FederatedServerTest::VerifyBroadcast, rank, server_address_);
   }
   for (auto& thread : threads) {
     thread.join();
@@ -138,7 +103,7 @@ TEST_F(FederatedServerTest, Broadcast) {
 TEST_F(FederatedServerTest, Mixture) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back(std::thread(&FederatedServerTest::VerifyMixture, rank, server_address_));
+    threads.emplace_back(&FederatedServerTest::VerifyMixture, rank, server_address_);
   }
   for (auto& thread : threads) {
     thread.join();
diff --git a/tests/cpp/tree/test_fit_stump.cc b/tests/cpp/tree/test_fit_stump.cc
index ef608e575..35a6af994 100644
--- a/tests/cpp/tree/test_fit_stump.cc
+++ b/tests/cpp/tree/test_fit_stump.cc
@@ -21,7 +21,8 @@ void TestFitStump(Context const *ctx) {
     }
   }
   linalg::Vector<float> out;
-  FitStump(ctx, gpair, kTargets, &out);
+  MetaInfo info;
+  FitStump(ctx, info, gpair, kTargets, &out);
   auto h_out = out.HostView();
   for (auto it = linalg::cbegin(h_out); it != linalg::cend(h_out); ++it) {
     // sum_hess == kRows

From 5891f752c8fad9627a6d6b2e5e911b83db749994 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 22 Mar 2023 17:45:20 +0800
Subject: [PATCH 24/32] Rework the MAP metric. (#8931)

- The new implementation is more strict as only binary labels are accepted. The previous implementation converts values greater than 1 to 1.
- Deterministic GPU. (no atomic add).
- Fix top-k handling.
- Precise definition of MAP. (There are other variants on how to handle top-k).
- Refactor GPU ranking tests.
---
 doc/parameter.rst                          |  13 +-
 python-package/xgboost/testing/__init__.py |   9 +-
 python-package/xgboost/testing/ranking.py  |   7 +-
 src/common/error_msg.h                     |   2 +-
 src/common/numeric.h                       |  26 +-
 src/common/ranking_utils.cc                |   9 +
 src/common/ranking_utils.cu                |   5 +
 src/common/ranking_utils.h                 |  65 +++++
 src/common/threading_utils.h               |  10 +-
 src/metric/rank_metric.cc                  |  97 ++++---
 src/metric/rank_metric.cu                  | 161 ++++++------
 src/metric/rank_metric.h                   |  13 +-
 tests/cpp/common/test_ranking_utils.cc     |  32 +++
 tests/cpp/common/test_ranking_utils.cu     |   6 +
 tests/cpp/common/test_ranking_utils.h      |   2 +
 tests/cpp/metric/test_rank_metric.cc       |   2 +-
 tests/python-gpu/test_gpu_ranking.py       | 278 ++++++++-------------
 tests/python/test_with_sklearn.py          |  44 +++-
 18 files changed, 458 insertions(+), 323 deletions(-)

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 99d6f0585..ac566af74 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -408,8 +408,17 @@ Specify the learning task and the corresponding learning objective. The objectiv
 
     - ``ndcg``: `Normalized Discounted Cumulative Gain <http://en.wikipedia.org/wiki/NDCG>`_
     - ``map``: `Mean Average Precision <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_
-    - ``ndcg@n``, ``map@n``: 'n' can be assigned as an integer to cut off the top positions in the lists for evaluation.
-    - ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions.
+
+      The `average precision` is defined as:
+
+      .. math::
+
+         AP@l = \frac{1}{\min(l, N)}\sum^l_{k=1}P@k \cdot I_{(k)}
+
+      where :math:`I_{(k)}` is an indicator function that equals :math:`1` when the document at position :math:`k` is relevant and :math:`0` otherwise, :math:`P@k` is the precision at :math:`k`, and :math:`N` is the total number of relevant documents. Lastly, the `mean average precision` is defined as the weighted average of :math:`AP@l` across all queries.
+
+    - ``ndcg@n``, ``map@n``: :math:`n` can be assigned as an integer to cut off the top positions in the lists for evaluation.
+    - ``ndcg-``, ``map-``, ``ndcg@n-``, ``map@n-``: In XGBoost, the NDCG and MAP evaluate the score of a list without any positive samples as :math:`1`. By appending "-" to the evaluation metric name, we can ask XGBoost to evaluate these scores as :math:`0` to be consistent under some conditions.
     - ``poisson-nloglik``: negative log-likelihood for Poisson regression
     - ``gamma-nloglik``: negative log-likelihood for gamma regression
     - ``cox-nloglik``: negative partial log-likelihood for Cox proportional hazards regression
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 3b33e8774..bb13b5523 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -14,6 +14,7 @@ import zipfile
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
+from pathlib import Path
 from platform import system
 from typing import (
     Any,
@@ -443,7 +444,7 @@ def get_mq2008(
     from sklearn.datasets import load_svmlight_files
 
     src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
-    target = dpath + "/MQ2008.zip"
+    target = os.path.join(os.path.expanduser(dpath), "MQ2008.zip")
     if not os.path.exists(target):
         request.urlretrieve(url=src, filename=target)
 
@@ -462,9 +463,9 @@ def get_mq2008(
         qid_valid,
     ) = load_svmlight_files(
         (
-            dpath + "MQ2008/Fold1/train.txt",
-            dpath + "MQ2008/Fold1/test.txt",
-            dpath + "MQ2008/Fold1/vali.txt",
+            Path(dpath) / "MQ2008" / "Fold1" / "train.txt",
+            Path(dpath) / "MQ2008" / "Fold1" / "test.txt",
+            Path(dpath) / "MQ2008" / "Fold1" / "vali.txt",
         ),
         query_id=True,
         zero_based=False,
diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py
index fe4fc8404..7c75012c2 100644
--- a/python-package/xgboost/testing/ranking.py
+++ b/python-package/xgboost/testing/ranking.py
@@ -48,7 +48,12 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
     def neg_mse(*args: Any, **kwargs: Any) -> float:
         return -float(mean_squared_error(*args, **kwargs))
 
-    ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method)
+    ranker = xgb.XGBRanker(
+        n_estimators=3,
+        eval_metric=neg_mse,
+        tree_method=tree_method,
+        disable_default_eval_metric=True,
+    )
     ranker.fit(df, y, eval_set=[(valid_df, y)])
     score = ranker.score(valid_df, y)
     assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1])
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 484595316..3dbb7f52c 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -22,7 +22,7 @@ constexpr StringView LabelScoreSize() {
 }
 
 constexpr StringView InfInData() {
-  return "Input data contains `inf` while `missing` is not set to `inf`";
+  return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
 }
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/common/numeric.h b/src/common/numeric.h
index 6a1c15fd0..2da85502a 100644
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@@ -1,13 +1,15 @@
-/*!
- * Copyright 2022, XGBoost contributors.
+/**
+ * Copyright 2022-2023 by XGBoost contributors.
  */
 #ifndef XGBOOST_COMMON_NUMERIC_H_
 #define XGBOOST_COMMON_NUMERIC_H_
 
 #include <dmlc/common.h>  // OMPException
 
-#include <algorithm>  // std::max
-#include <iterator>   // std::iterator_traits
+#include <algorithm>  // for std::max
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t
+#include <iterator>   // for iterator_traits
 #include <vector>
 
 #include "common.h"                      // AssertGPUSupport
@@ -15,8 +17,7 @@
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 
 /**
  * \brief Run length encode on CPU, input must be sorted.
@@ -111,11 +112,11 @@ inline double Reduce(Context const*, HostDeviceVector<float> const&) {
 namespace cpu_impl {
 template <typename It, typename V = typename It::value_type>
 V Reduce(Context const* ctx, It first, It second, V const& init) {
-  size_t n = std::distance(first, second);
-  common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(ctx->Threads(), init);
-  common::ParallelFor(n, ctx->Threads(),
-                      [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
-  auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + ctx->Threads(), init);
+  std::size_t n = std::distance(first, second);
+  auto n_threads = static_cast<std::size_t>(std::min(n, static_cast<std::size_t>(ctx->Threads())));
+  common::MemStackAllocator<V, common::DefaultMaxThreads()> result_tloc(n_threads, init);
+  common::ParallelFor(n, n_threads, [&](auto i) { result_tloc[omp_get_thread_num()] += first[i]; });
+  auto result = std::accumulate(result_tloc.cbegin(), result_tloc.cbegin() + n_threads, init);
   return result;
 }
 }  // namespace cpu_impl
@@ -144,7 +145,6 @@ void Iota(Context const* ctx, It first, It last,
     });
   }
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
 
 #endif  // XGBOOST_COMMON_NUMERIC_H_
diff --git a/src/common/ranking_utils.cc b/src/common/ranking_utils.cc
index c8069784b..d831b551c 100644
--- a/src/common/ranking_utils.cc
+++ b/src/common/ranking_utils.cc
@@ -114,6 +114,15 @@ void NDCGCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUS
 
 DMLC_REGISTER_PARAMETER(LambdaRankParam);
 
+void MAPCache::InitOnCPU(Context const*, MetaInfo const& info) {  // host-side label check
+  auto const& h_label = info.labels.HostView().Slice(linalg::All(), 0);  // index 0 of dim 1
+  CheckMapLabels(h_label, [](auto beg, auto end, auto op) { return std::all_of(beg, end, op); });
+}
+
+#if !defined(XGBOOST_USE_CUDA)  // definition used when the build has no CUDA support
+void MAPCache::InitOnCUDA(Context const*, MetaInfo const&) { common::AssertGPUSupport(); }
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 std::string ParseMetricName(StringView name, StringView param, position_t* topn, bool* minus) {
   std::string out_name;
   if (!param.empty()) {
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
index ce9cda4e2..8fbf89818 100644
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -204,4 +204,9 @@ void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
   dh::LaunchN(MaxGroupSize(), cuctx->Stream(),
               [=] XGBOOST_DEVICE(std::size_t i) { d_discount[i] = CalcDCGDiscount(i); });
 }
+
+void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
+  auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);  // on ctx->gpu_id
+  CheckMapLabels(d_label, CheckMAPOp{ctx->CUDACtx()});  // CheckMAPOp fills the all_of role
+}
 }  // namespace xgboost::ltr
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index 88283fba2..727f918f2 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -358,6 +358,71 @@ void CheckNDCGLabels(ltr::LambdaRankParam const& p, linalg::VectorView<float con
   }
 }
 
+template <typename AllOf>  // AllOf is invoked as all_of(first, last, predicate)
+bool IsBinaryRel(linalg::VectorView<float const> label, AllOf all_of) {
+  auto s_label = label.Values();
+  return all_of(s_label.data(), s_label.data() + s_label.size(), [] XGBOOST_DEVICE(float y) {
+    return std::abs(y - 1.0f) < kRtEps || std::abs(y - 0.0f) < kRtEps;  // 1 or 0, up to kRtEps
+  });
+}
+/**
+ * \brief Validate label for MAP. The CHECK fails unless every label is 0 or 1 (within kRtEps).
+ *
+ * \tparam AllOf Implementation of std::all_of. Specified as a parameter to reuse the check for
+ *         both CPU and GPU.
+ * \param label Labels to validate.
+ */
+template <typename AllOf>
+void CheckMapLabels(linalg::VectorView<float const> label, AllOf all_of) {
+  auto is_binary = IsBinaryRel(label, all_of);
+  CHECK(is_binary) << "MAP can only be used with binary labels.";
+}
+
+class MAPCache : public RankingCache {
+  // Relevant-document counts; the buffer holds one entry per sample (see NumRelevant()).
+  HostDeviceVector<double> n_rel_;
+  // \sum l_k/k
+  HostDeviceVector<double> acc_;
+  HostDeviceVector<double> map_;  // one entry per query group (see Map())
+  // Number of samples in this dataset.
+  std::size_t n_samples_{0};
+
+  void InitOnCPU(Context const* ctx, MetaInfo const& info);
+  void InitOnCUDA(Context const* ctx, MetaInfo const& info);
+
+ public:
+  MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p)
+      : RankingCache{ctx, info, p}, n_samples_{static_cast<std::size_t>(info.num_row_)} {
+    if (ctx->IsCPU()) {  // dispatch on the context: host or device initialization
+      this->InitOnCPU(ctx, info);
+    } else {
+      this->InitOnCUDA(ctx, info);
+    }
+  }
+
+  common::Span<double> NumRelevant(Context const* ctx) {  // sized on first access
+    if (n_rel_.Empty()) {
+      n_rel_.SetDevice(ctx->gpu_id);
+      n_rel_.Resize(n_samples_);
+    }
+    return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan();  // view chosen by ctx
+  }
+  common::Span<double> Acc(Context const* ctx) {  // sized on first access
+    if (acc_.Empty()) {
+      acc_.SetDevice(ctx->gpu_id);
+      acc_.Resize(n_samples_);
+    }
+    return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan();  // view chosen by ctx
+  }
+  common::Span<double> Map(Context const* ctx) {  // sized on first access
+    if (map_.Empty()) {
+      map_.SetDevice(ctx->gpu_id);
+      map_.Resize(this->Groups());
+    }
+    return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan();  // view chosen by ctx
+  }
+};
+
 /**
  * \brief Parse name for ranking metric given parameters.
  *
diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h
index a52695e02..d80008cc0 100644
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -8,9 +8,11 @@
 #include <dmlc/omp.h>
 
 #include <algorithm>
-#include <cstdint>  // std::int32_t
+#include <cstdint>  // for int32_t
+#include <cstdlib>  // for malloc, free
 #include <limits>
-#include <type_traits>  // std::is_signed
+#include <new>          // for bad_alloc
+#include <type_traits>  // for is_signed
 #include <vector>
 
 #include "xgboost/logging.h"
@@ -266,7 +268,7 @@ class MemStackAllocator {
     if (MaxStackSize >= required_size_) {
       ptr_ = stack_mem_;
     } else {
-      ptr_ = reinterpret_cast<T*>(malloc(required_size_ * sizeof(T)));
+      ptr_ = reinterpret_cast<T*>(std::malloc(required_size_ * sizeof(T)));
     }
     if (!ptr_) {
       throw std::bad_alloc{};
@@ -278,7 +280,7 @@ class MemStackAllocator {
 
   ~MemStackAllocator() {
     if (required_size_ > MaxStackSize) {
-      free(ptr_);
+      std::free(ptr_);
     }
   }
   T& operator[](size_t i) { return ptr_[i]; }
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index c2aa48cab..3a1416b0f 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -284,37 +284,6 @@ struct EvalPrecision : public EvalRank {
   }
 };
 
-/*! \brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAP : public EvalRank {
- public:
-  explicit EvalMAP(const char* name, const char* param) : EvalRank(name, param) {}
-
-  double EvalGroup(PredIndPairContainer *recptr) const override {
-    PredIndPairContainer &rec(*recptr);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-    unsigned nhits = 0;
-    double sumap = 0.0;
-    for (size_t i = 0; i < rec.size(); ++i) {
-      if (rec[i].second != 0) {
-        nhits += 1;
-        if (i < this->topn) {
-          sumap += static_cast<double>(nhits) / (i + 1);
-        }
-      }
-    }
-    if (nhits != 0) {
-      sumap /= nhits;
-      return sumap;
-    } else {
-      if (this->minus) {
-        return 0.0;
-      } else {
-        return 1.0;
-      }
-    }
-  }
-};
-
 /*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
 struct EvalCox : public MetricNoCache {
  public:
@@ -370,10 +339,6 @@ XGBOOST_REGISTER_METRIC(Precision, "pre")
 .describe("precision@k for rank.")
 .set_body([](const char* param) { return new EvalPrecision("pre", param); });
 
-XGBOOST_REGISTER_METRIC(MAP, "map")
-.describe("map@k for rank.")
-.set_body([](const char* param) { return new EvalMAP("map", param); });
-
 XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
 .describe("Negative log partial likelihood of Cox proportional hazards model.")
 .set_body([](const char*) { return new EvalCox(); });
@@ -516,6 +481,68 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
   }
 };
 
+class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
+ public:
+  using EvalRankWithCache::EvalRankWithCache;
+  const char* Name() const override { return name_.c_str(); }
+  // Per-query AP combined with the query weights by Finalize(); CUDA uses cuda_impl::MAPScore.
+  double Eval(HostDeviceVector<float> const& predt, MetaInfo const& info,
+              std::shared_ptr<ltr::MAPCache> p_cache) override {
+    if (ctx_->IsCUDA()) {
+      auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
+      return Finalize(map.Residue(), map.Weights());
+    }
+
+    auto gptr = p_cache->DataGroupPtr(ctx_);
+    auto h_label = info.labels.HostView().Slice(linalg::All(), 0);
+
+    auto map_gloc = p_cache->Map(ctx_);
+    std::fill_n(map_gloc.data(), map_gloc.size(), 0.0);
+    // Predictions are only consumed through the sorted index, the hits come from the labels.
+    auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
+
+    common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
+      auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
+      auto g_rank = rank_idx.subspan(gptr[g]);
+
+      auto n = std::min(static_cast<std::size_t>(param_.TopK()), g_label.Size());
+      double n_hits{0.0};
+      for (std::size_t i = 0; i < n; ++i) {
+        auto p = g_label(g_rank[i]);
+        n_hits += p;
+        map_gloc[g] += n_hits / static_cast<double>((i + 1)) * p;
+      }
+      // Documents below the cutoff still count toward the number of relevant documents.
+      for (std::size_t i = n; i < g_label.Size(); ++i) {
+        n_hits += g_label(g_rank[i]);
+      }
+      if (n_hits > 0.0) {
+        map_gloc[g] /= std::min(n_hits, static_cast<double>(param_.TopK()));
+      } else {
+        map_gloc[g] = minus_ ? 0.0 : 1.0;  // no relevant document in this group
+      }
+    });
+
+    auto sw = 0.0;
+    auto weight = common::MakeOptionalWeights(ctx_, info.weights_);
+    if (!weight.Empty()) {
+      CHECK_EQ(weight.weights.size(), p_cache->Groups());
+    }
+    for (std::size_t i = 0; i < map_gloc.size(); ++i) {
+      map_gloc[i] = map_gloc[i] * weight[i];
+      sw += weight[i];
+    }
+    auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
+    return Finalize(sum, sw);
+  }
+};
+
+XGBOOST_REGISTER_METRIC(EvalMAP, "map")
+    .describe("map@k for ranking.")
+    .set_body([](char const* param) {  // `param` is forwarded unchanged to the constructor
+      return new EvalMAPScore{"map", param};
+    });
+
 XGBOOST_REGISTER_METRIC(EvalNDCG, "ndcg")
     .describe("ndcg@k for ranking.")
     .set_body([](char const* param) {
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index 4ab422a96..00116ebdb 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -125,89 +125,10 @@ struct EvalPrecisionGpu {
 };
 
 
-/*! \brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAPGpu {
- public:
-  static double EvalMetric(const dh::SegmentSorter<float> &pred_sorter,
-                           const float *dlabels,
-                           const EvalRankConfig &ecfg) {
-    // Group info on device
-    const auto &dgroups = pred_sorter.GetGroupsSpan();
-    const auto ngroups = pred_sorter.GetNumGroups();
-    const auto &dgroup_idx = pred_sorter.GetGroupSegmentsSpan();
-
-    // Original positions of the predictions after they have been sorted
-    const auto &dpreds_orig_pos = pred_sorter.GetOriginalPositionsSpan();
-
-    // First, determine non zero labels in the dataset individually
-    const auto nitems = pred_sorter.GetNumItems();
-    dh::caching_device_vector<uint32_t> hits(nitems, 0);
-    auto DetermineNonTrivialLabelLambda = [=] __device__(uint32_t idx) {
-      return (static_cast<unsigned>(dlabels[dpreds_orig_pos[idx]]) != 0) ? 1 : 0;
-    };  // NOLINT
-
-    thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
-                      thrust::make_counting_iterator(nitems),
-                      hits.begin(),
-                      DetermineNonTrivialLabelLambda);
-
-    // Allocator to be used by sort for managing space overhead while performing prefix scans
-    dh::XGBCachingDeviceAllocator<char> alloc;
-
-    // Next, prefix scan the nontrivial labels that are segmented to accumulate them.
-    // This is required for computing the metric sum
-    // Data segmented into different groups...
-    thrust::inclusive_scan_by_key(thrust::cuda::par(alloc),
-                                  dh::tcbegin(dgroup_idx), dh::tcend(dgroup_idx),
-                                  hits.begin(),  // Input value
-                                  hits.begin());  // In-place scan
-
-    // Find each group's metric sum
-    dh::caching_device_vector<double> sumap(ngroups, 0);
-    auto *dsumap = sumap.data().get();
-    const auto *dhits = hits.data().get();
-
-    int device_id = -1;
-    dh::safe_cuda(cudaGetDevice(&device_id));
-    // For each group item compute the aggregated precision
-    dh::LaunchN(nitems, nullptr, [=] __device__(uint32_t idx) {
-      if (DetermineNonTrivialLabelLambda(idx)) {
-        const auto group_idx = dgroup_idx[idx];
-        const auto group_begin = dgroups[group_idx];
-        const auto ridx = idx - group_begin;
-        if (ridx < ecfg.topn) {
-          atomicAdd(&dsumap[group_idx],
-                    static_cast<double>(dhits[idx]) / (ridx + 1));
-        }
-      }
-    });
-
-    // Aggregate the group's item precisions
-    dh::LaunchN(ngroups, nullptr, [=] __device__(uint32_t gidx) {
-      auto nhits = dgroups[gidx + 1] ? dhits[dgroups[gidx + 1] - 1] : 0;
-      if (nhits != 0) {
-        dsumap[gidx] /= nhits;
-      } else {
-        if (ecfg.minus) {
-          dsumap[gidx] = 0;
-        } else {
-          dsumap[gidx] = 1;
-        }
-      }
-    });
-
-    return thrust::reduce(thrust::cuda::par(alloc), sumap.begin(), sumap.end());
-  }
-};
-
 XGBOOST_REGISTER_GPU_METRIC(PrecisionGpu, "pre")
 .describe("precision@k for rank computed on GPU.")
 .set_body([](const char* param) { return new EvalRankGpu<EvalPrecisionGpu>("pre", param); });
 
-XGBOOST_REGISTER_GPU_METRIC(MAPGpu, "map")
-.describe("map@k for rank computed on GPU.")
-.set_body([](const char* param) { return new EvalRankGpu<EvalMAPGpu>("map", param); });
-
 namespace cuda_impl {
 PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
                              HostDeviceVector<float> const &predt, bool minus,
@@ -245,5 +166,87 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
                              PackedReduceResult{0.0, 0.0});
   return pair;
 }
+
+/** \brief MAP@k on device, returns (weighted sum of per-query AP, sum of query weights). */
+PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
+                            HostDeviceVector<float> const &predt, bool minus,
+                            std::shared_ptr<ltr::MAPCache> p_cache) {
+  auto d_group_ptr = p_cache->DataGroupPtr(ctx);
+  auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
+
+  predt.SetDevice(ctx->gpu_id);
+  auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
+  auto key_it = dh::MakeTransformIterator<std::size_t>(  // sample index -> group index
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) { return dh::SegmentId(d_group_ptr, i); });
+
+  // Label of the document at position i after each group is reordered by d_rank_idx.
+  auto get_label = [=] XGBOOST_DEVICE(std::size_t i) {
+    auto g = key_it[i];
+    auto g_begin = d_group_ptr[g];
+    auto g_end = d_group_ptr[g + 1];
+    i -= g_begin;
+    auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
+    auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
+    return g_label(g_rank[i]);
+  };
+  auto it = dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), get_label);
+
+  auto cuctx = ctx->CUDACtx();
+  auto n_rel = p_cache->NumRelevant(ctx);
+  // Running count of relevant documents within each group, following the ranked order.
+  thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + d_label.Size(), it, n_rel.data());
+
+  double topk = p_cache->Param().TopK();
+  auto map = p_cache->Map(ctx);
+  thrust::fill_n(cuctx->CTP(), map.data(), map.size(), 0.0);
+  {
+    // Contribution of one ranked position to the AP of its group, zero beyond the top-k.
+    auto val_it = dh::MakeTransformIterator<double>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) {
+          auto g = key_it[i];
+          auto g_begin = d_group_ptr[g];
+          auto g_end = d_group_ptr[g + 1];
+          i -= g_begin;
+          if (i >= topk) {
+            return 0.0;
+          }
+
+          auto g_label = d_label.Slice(linalg::Range(g_begin, g_end));
+          auto g_rank = d_rank_idx.subspan(g_begin, g_end - g_begin);
+          auto label = g_label(g_rank[i]);
+          auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin);
+          auto nhits = g_n_rel[i];
+          return nhits / static_cast<double>(i + 1) * label;
+        });
+
+    // Two-phase cub call: the first one only queries the temporary storage size.
+    std::size_t bytes{0};
+    cub::DeviceSegmentedReduce::Sum(nullptr, bytes, val_it, map.data(), p_cache->Groups(),
+                                    d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
+    dh::TemporaryArray<char> temp(bytes);
+    cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, val_it, map.data(), p_cache->Groups(),
+                                    d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
+  }
+
+  auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
+  if (!d_weight.Empty()) {
+    CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
+  }
+  auto val_it = dh::MakeTransformIterator<PackedReduceResult>(
+      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t g) {
+        auto g_begin = d_group_ptr[g];
+        auto g_end = d_group_ptr[g + 1];
+        auto g_n_rel = n_rel.subspan(g_begin, g_end - g_begin);
+        if (!g_n_rel.empty() && g_n_rel.back() > 0.0) {
+          return PackedReduceResult{map[g] * d_weight[g] / std::min(g_n_rel.back(), topk),
+                                    static_cast<double>(d_weight[g])};
+        }
+        // No relevant document: weight the fallback score as well, like the CPU metric does.
+        return PackedReduceResult{(minus ? 0.0 : 1.0) * d_weight[g],
+                                  static_cast<double>(d_weight[g])};
+      });
+  return thrust::reduce(cuctx->CTP(), val_it, val_it + map.size(), PackedReduceResult{0.0, 0.0});
+}
 }  // namespace cuda_impl
 }  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.h b/src/metric/rank_metric.h
index 0be0d4ee8..b3b121973 100644
--- a/src/metric/rank_metric.h
+++ b/src/metric/rank_metric.h
@@ -6,7 +6,7 @@
 #include <memory>                        // for shared_ptr
 
 #include "../common/common.h"            // for AssertGPUSupport
-#include "../common/ranking_utils.h"     // for NDCGCache
+#include "../common/ranking_utils.h"     // for NDCGCache, MAPCache
 #include "metric_common.h"               // for PackedReduceResult
 #include "xgboost/context.h"             // for Context
 #include "xgboost/data.h"                // for MetaInfo
@@ -19,6 +19,10 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
                              HostDeviceVector<float> const &predt, bool minus,
                              std::shared_ptr<ltr::NDCGCache> p_cache);
 
+PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
+                            HostDeviceVector<float> const &predt, bool minus,
+                            std::shared_ptr<ltr::MAPCache> p_cache);
+
 #if !defined(XGBOOST_USE_CUDA)
 inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
                                     HostDeviceVector<float> const &, bool,
@@ -26,6 +30,13 @@ inline PackedReduceResult NDCGScore(Context const *, MetaInfo const &,
   common::AssertGPUSupport();
   return {};
 }
+
+inline PackedReduceResult MAPScore(Context const *, MetaInfo const &,
+                                   HostDeviceVector<float> const &, bool,
+                                   std::shared_ptr<ltr::MAPCache>) {
+  common::AssertGPUSupport();
+  return {};  // placeholder so that the function has a return value
+}
 #endif
 }  // namespace cuda_impl
 }  // namespace metric
diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc
index 9240db0d4..919102278 100644
--- a/tests/cpp/common/test_ranking_utils.cc
+++ b/tests/cpp/common/test_ranking_utils.cc
@@ -177,4 +177,36 @@ TEST(NDCGCache, InitFromCPU) {
   Context ctx;
   TestNDCGCache(&ctx);
 }
+
+void TestMAPCache(Context const* ctx) {
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  std::vector<float> h_data(32);
+
+  common::Iota(ctx, h_data.begin(), h_data.end(), 0.0f);  // presumably 0, 1, ..., 31
+  info.labels.Reshape(h_data.size());
+  info.num_row_ = h_data.size();
+  info.labels.Data()->HostVector() = std::move(h_data);
+
+  auto fail = [&]() { std::make_shared<MAPCache>(ctx, info, param); };
+  // Constructing the cache must throw because the labels above are not binary.
+  ASSERT_THROW(fail(), dmlc::Error);
+
+  h_data = std::vector<float>(32, 0.0f);  // valid input: all zeros except a single one
+  h_data[1] = 1.0f;
+  info.labels.Data()->HostVector() = h_data;
+  auto p_cache = std::make_shared<MAPCache>(ctx, info, param);
+
+  ASSERT_EQ(p_cache->Acc(ctx).size(), info.num_row_);  // per-sample buffers
+  ASSERT_EQ(p_cache->NumRelevant(ctx).size(), info.num_row_);
+}
+
+TEST(MAPCache, InitFromCPU) {
+  Context ctx;
+  ctx.Init(Args{});  // no arguments supplied
+  TestMAPCache(&ctx);  // test body shared with the GPU variant
+}
 }  // namespace xgboost::ltr
diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu
index 5fda42c72..db0ff3b66 100644
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -95,4 +95,10 @@ TEST(NDCGCache, InitFromGPU) {
   ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
   TestNDCGCache(&ctx);
 }
+
+TEST(MAPCache, InitFromGPU) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestMAPCache(&ctx);
+}
 }  // namespace xgboost::ltr
diff --git a/tests/cpp/common/test_ranking_utils.h b/tests/cpp/common/test_ranking_utils.h
index ede687ff4..8ff92df9a 100644
--- a/tests/cpp/common/test_ranking_utils.h
+++ b/tests/cpp/common/test_ranking_utils.h
@@ -6,4 +6,6 @@
 
 namespace xgboost::ltr {
 void TestNDCGCache(Context const* ctx);
+
+void TestMAPCache(Context const* ctx);
 }  // namespace xgboost::ltr
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index 337ddbc8a..3e1028c48 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -141,7 +141,7 @@ TEST(Metric, DeclareUnifiedTest(MAP)) {
   // Rank metric with group info
   EXPECT_NEAR(GetMetricEval(metric,
                             {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f},
-                            {2, 7, 1, 0, 5, 0},  // Labels
+                            {1, 1, 1, 0, 1, 0},  // Labels
                             {},  // Weights
                             {0, 2, 5, 6}),  // Group info
               0.8611f, 0.001f);
diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py
index d86c1aa14..b8be5dda1 100644
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -1,194 +1,130 @@
-import itertools
 import os
-import shutil
-import urllib.request
-import zipfile
+from typing import Dict
 
 import numpy as np
+import pytest
 
 import xgboost
 from xgboost import testing as tm
 
-pytestmark = tm.timeout(10)
+pytestmark = tm.timeout(30)
 
 
-class TestRanking:
-    @classmethod
-    def setup_class(cls):
-        """
-        Download and setup the test fixtures
-        """
-        from sklearn.datasets import load_svmlight_files
+def comp_training_with_rank_objective(
+    dtrain: xgboost.DMatrix,
+    dtest: xgboost.DMatrix,
+    rank_objective: str,
+    metric_name: str,
+    tolerance: float = 1e-02,
+) -> None:
+    """Internal method that trains the dataset using the rank objective on GPU and CPU,
+    evaluates the metric and determines if the delta between the metric is within the
+    tolerance level.
 
-        # download the test data
-        cls.dpath = os.path.join(tm.demo_dir(__file__), "rank/")
-        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
-        target = os.path.join(cls.dpath, "MQ2008.zip")
+    """
+    # specify validations set to watch performance
+    watchlist = [(dtest, "eval"), (dtrain, "train")]
 
-        if os.path.exists(cls.dpath) and os.path.exists(target):
-            print("Skipping dataset download...")
-        else:
-            urllib.request.urlretrieve(url=src, filename=target)
-            with zipfile.ZipFile(target, 'r') as f:
-                f.extractall(path=cls.dpath)
+    params = {
+        "booster": "gbtree",
+        "tree_method": "gpu_hist",
+        "gpu_id": 0,
+        "predictor": "gpu_predictor",
+    }
 
-        (x_train, y_train, qid_train, x_test, y_test, qid_test,
-         x_valid, y_valid, qid_valid) = load_svmlight_files(
-            (cls.dpath + "MQ2008/Fold1/train.txt",
-             cls.dpath + "MQ2008/Fold1/test.txt",
-             cls.dpath + "MQ2008/Fold1/vali.txt"),
-            query_id=True, zero_based=False)
-        # instantiate the matrices
-        cls.dtrain = xgboost.DMatrix(x_train, y_train)
-        cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
-        cls.dtest = xgboost.DMatrix(x_test, y_test)
-        # set the group counts from the query IDs
-        cls.dtrain.set_group([len(list(items))
-                              for _key, items in itertools.groupby(qid_train)])
-        cls.dtest.set_group([len(list(items))
-                             for _key, items in itertools.groupby(qid_test)])
-        cls.dvalid.set_group([len(list(items))
-                              for _key, items in itertools.groupby(qid_valid)])
-        # save the query IDs for testing
-        cls.qid_train = qid_train
-        cls.qid_test = qid_test
-        cls.qid_valid = qid_valid
+    num_trees = 100
+    check_metric_improvement_rounds = 10
 
-        def setup_weighted(x, y, groups):
-            # Setup weighted data
-            data = xgboost.DMatrix(x, y)
-            groups_segment = [len(list(items))
-                              for _key, items in itertools.groupby(groups)]
-            data.set_group(groups_segment)
-            n_groups = len(groups_segment)
-            weights = np.ones((n_groups,))
-            data.set_weight(weights)
-            return data
+    evals_result: Dict[str, Dict] = {}
+    params["objective"] = rank_objective
+    params["eval_metric"] = metric_name
+    bst = xgboost.train(
+        params,
+        dtrain,
+        num_boost_round=num_trees,
+        early_stopping_rounds=check_metric_improvement_rounds,
+        evals=watchlist,
+        evals_result=evals_result,
+    )
+    gpu_scores = evals_result["train"][metric_name][-1]
 
-        cls.dtrain_w = setup_weighted(x_train, y_train, qid_train)
-        cls.dtest_w = setup_weighted(x_test, y_test, qid_test)
-        cls.dvalid_w = setup_weighted(x_valid, y_valid, qid_valid)
+    evals_result = {}
 
-        # model training parameters
-        cls.params = {'booster': 'gbtree',
-                      'tree_method': 'gpu_hist',
-                      'gpu_id': 0,
-                      'predictor': 'gpu_predictor'}
-        cls.cpu_params = {'booster': 'gbtree',
-                          'tree_method': 'hist',
-                          'gpu_id': -1,
-                          'predictor': 'cpu_predictor'}
+    cpu_params = {
+        "booster": "gbtree",
+        "tree_method": "hist",
+        "gpu_id": -1,
+        "predictor": "cpu_predictor",
+    }
+    cpu_params["objective"] = rank_objective
+    cpu_params["eval_metric"] = metric_name
+    bstc = xgboost.train(
+        cpu_params,
+        dtrain,
+        num_boost_round=num_trees,
+        early_stopping_rounds=check_metric_improvement_rounds,
+        evals=watchlist,
+        evals_result=evals_result,
+    )
+    cpu_scores = evals_result["train"][metric_name][-1]
 
-    @classmethod
-    def teardown_class(cls):
-        """
-        Cleanup test artifacts from download and unpacking
-        :return:
-        """
-        os.remove(os.path.join(cls.dpath, "MQ2008.zip"))
-        shutil.rmtree(os.path.join(cls.dpath, "MQ2008"))
+    info = (rank_objective, metric_name)
+    assert np.allclose(gpu_scores, cpu_scores, tolerance, tolerance), info
+    assert np.allclose(bst.best_score, bstc.best_score, tolerance, tolerance), info
 
-    @classmethod
-    def __test_training_with_rank_objective(cls, rank_objective, metric_name, tolerance=1e-02):
-        """
-        Internal method that trains the dataset using the rank objective on GPU and CPU, evaluates
-        the metric and determines if the delta between the metric is within the tolerance level
-        :return:
-        """
-        # specify validations set to watch performance
-        watchlist = [(cls.dtest, 'eval'), (cls.dtrain, 'train')]
+    evals_result_weighted: Dict[str, Dict] = {}
+    dtest.set_weight(np.ones((dtest.get_group().size,)))
+    dtrain.set_weight(np.ones((dtrain.get_group().size,)))
+    watchlist = [(dtest, "eval"), (dtrain, "train")]
+    bst_w = xgboost.train(
+        params,
+        dtrain,
+        num_boost_round=num_trees,
+        early_stopping_rounds=check_metric_improvement_rounds,
+        evals=watchlist,
+        evals_result=evals_result_weighted,
+    )
+    weighted_metric = evals_result_weighted["train"][metric_name][-1]
 
-        num_trees = 100
-        check_metric_improvement_rounds = 10
+    tolerance = 1e-5
+    assert np.allclose(bst_w.best_score, bst.best_score, tolerance, tolerance)
+    assert np.allclose(weighted_metric, gpu_scores, tolerance, tolerance)
 
-        evals_result = {}
-        cls.params['objective'] = rank_objective
-        cls.params['eval_metric'] = metric_name
-        bst = xgboost.train(
-            cls.params, cls.dtrain, num_boost_round=num_trees,
-            early_stopping_rounds=check_metric_improvement_rounds,
-            evals=watchlist, evals_result=evals_result)
-        gpu_map_metric = evals_result['train'][metric_name][-1]
 
-        evals_result = {}
-        cls.cpu_params['objective'] = rank_objective
-        cls.cpu_params['eval_metric'] = metric_name
-        bstc = xgboost.train(
-            cls.cpu_params, cls.dtrain, num_boost_round=num_trees,
-            early_stopping_rounds=check_metric_improvement_rounds,
-            evals=watchlist, evals_result=evals_result)
-        cpu_map_metric = evals_result['train'][metric_name][-1]
+@pytest.mark.parametrize(
+    "objective,metric",
+    [
+        ("rank:pairwise", "auc"),
+        ("rank:pairwise", "ndcg"),
+        ("rank:pairwise", "map"),
+        ("rank:ndcg", "auc"),
+        ("rank:ndcg", "ndcg"),
+        ("rank:ndcg", "map"),
+        ("rank:map", "auc"),
+        ("rank:map", "ndcg"),
+        ("rank:map", "map"),
+    ],
+)
+def test_with_mq2008(objective, metric) -> None:
+    (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    ) = tm.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))
 
-        assert np.allclose(gpu_map_metric, cpu_map_metric, tolerance,
-                           tolerance)
-        assert np.allclose(bst.best_score, bstc.best_score, tolerance,
-                           tolerance)
+    if metric.find("map") != -1 or objective.find("map") != -1:
+        y_train[y_train <= 1] = 0.0
+        y_train[y_train > 1] = 1.0
+        y_test[y_test <= 1] = 0.0
+        y_test[y_test > 1] = 1.0
 
-        evals_result_weighted = {}
-        watchlist = [(cls.dtest_w, 'eval'), (cls.dtrain_w, 'train')]
-        bst_w = xgboost.train(
-            cls.params, cls.dtrain_w, num_boost_round=num_trees,
-            early_stopping_rounds=check_metric_improvement_rounds,
-            evals=watchlist, evals_result=evals_result_weighted)
-        weighted_metric = evals_result_weighted['train'][metric_name][-1]
-        # GPU Ranking is not deterministic due to `AtomicAddGpair`,
-        # remove tolerance once the issue is resolved.
-        # https://github.com/dmlc/xgboost/issues/5561
-        assert np.allclose(bst_w.best_score, bst.best_score,
-                           tolerance, tolerance)
-        assert np.allclose(weighted_metric, gpu_map_metric,
-                           tolerance, tolerance)
+    dtrain = xgboost.DMatrix(x_train, y_train, qid=qid_train)
+    dtest = xgboost.DMatrix(x_test, y_test, qid=qid_test)
 
-    def test_training_rank_pairwise_map_metric(self):
-        """
-        Train an XGBoost ranking model with pairwise objective function and compare map metric
-        """
-        self.__test_training_with_rank_objective('rank:pairwise', 'map')
-
-    def test_training_rank_pairwise_auc_metric(self):
-        """
-        Train an XGBoost ranking model with pairwise objective function and compare auc metric
-        """
-        self.__test_training_with_rank_objective('rank:pairwise', 'auc')
-
-    def test_training_rank_pairwise_ndcg_metric(self):
-        """
-        Train an XGBoost ranking model with pairwise objective function and compare ndcg metric
-        """
-        self.__test_training_with_rank_objective('rank:pairwise', 'ndcg')
-
-    def test_training_rank_ndcg_map(self):
-        """
-        Train an XGBoost ranking model with ndcg objective function and compare map metric
-        """
-        self.__test_training_with_rank_objective('rank:ndcg', 'map')
-
-    def test_training_rank_ndcg_auc(self):
-        """
-        Train an XGBoost ranking model with ndcg objective function and compare auc metric
-        """
-        self.__test_training_with_rank_objective('rank:ndcg', 'auc')
-
-    def test_training_rank_ndcg_ndcg(self):
-        """
-        Train an XGBoost ranking model with ndcg objective function and compare ndcg metric
-        """
-        self.__test_training_with_rank_objective('rank:ndcg', 'ndcg')
-
-    def test_training_rank_map_map(self):
-        """
-        Train an XGBoost ranking model with map objective function and compare map metric
-        """
-        self.__test_training_with_rank_objective('rank:map', 'map')
-
-    def test_training_rank_map_auc(self):
-        """
-        Train an XGBoost ranking model with map objective function and compare auc metric
-        """
-        self.__test_training_with_rank_objective('rank:map', 'auc')
-
-    def test_training_rank_map_ndcg(self):
-        """
-        Train an XGBoost ranking model with map objective function and compare ndcg metric
-        """
-        self.__test_training_with_rank_objective('rank:map', 'ndcg')
+    comp_training_with_rank_objective(dtrain, dtest, objective, metric)
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index baef690ee..c34b7d2d1 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -128,12 +128,23 @@ def test_ranking():
 
     x_test = np.random.rand(100, 10)
 
-    params = {'tree_method': 'exact', 'objective': 'rank:pairwise',
-              'learning_rate': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1,
-              'max_depth': 6, 'n_estimators': 4}
+    params = {
+        "tree_method": "exact",
+        "learning_rate": 0.1,
+        "gamma": 1.0,
+        "min_child_weight": 0.1,
+        "max_depth": 6,
+        "eval_metric": "ndcg",
+        "n_estimators": 4,
+    }
     model = xgb.sklearn.XGBRanker(**params)
-    model.fit(x_train, y_train, group=train_group,
-              eval_set=[(x_valid, y_valid)], eval_group=[valid_group])
+    model.fit(
+        x_train,
+        y_train,
+        group=train_group,
+        eval_set=[(x_valid, y_valid)],
+        eval_group=[valid_group],
+    )
     assert model.evals_result()
 
     pred = model.predict(x_test)
@@ -145,11 +156,18 @@ def test_ranking():
     assert train_data.get_label().shape[0] == x_train.shape[0]
     valid_data.set_group(valid_group)
 
-    params_orig = {'tree_method': 'exact', 'objective': 'rank:pairwise',
-                   'eta': 0.1, 'gamma': 1.0,
-                   'min_child_weight': 0.1, 'max_depth': 6}
-    xgb_model_orig = xgb.train(params_orig, train_data, num_boost_round=4,
-                               evals=[(valid_data, 'validation')])
+    params_orig = {
+        "tree_method": "exact",
+        "objective": "rank:pairwise",
+        "eta": 0.1,
+        "gamma": 1.0,
+        "min_child_weight": 0.1,
+        "max_depth": 6,
+        "eval_metric": "ndcg",
+    }
+    xgb_model_orig = xgb.train(
+        params_orig, train_data, num_boost_round=4, evals=[(valid_data, "validation")]
+    )
     pred_orig = xgb_model_orig.predict(test_data)
 
     np.testing.assert_almost_equal(pred, pred_orig)
@@ -165,7 +183,11 @@ def test_ranking_metric() -> None:
     # sklearn compares the number of mis-classified docs, while the one in xgboost
     # compares the number of mis-classified pairs.
     ltr = xgb.XGBRanker(
-        eval_metric=roc_auc_score, n_estimators=10, tree_method="hist", max_depth=2
+        eval_metric=roc_auc_score,
+        n_estimators=10,
+        tree_method="hist",
+        max_depth=2,
+        objective="rank:pairwise",
     )
     ltr.fit(
         X,

From a05799ed391a7c07bc05e70b178d735cb17ad895 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 22 Mar 2023 19:13:44 +0800
Subject: [PATCH 25/32] Specify char type in JSON. (#8949)

char is defined as signed on x86 but unsigned on arm64

- Use `std::int8_t` instead of char.
- Fix include when clang is pretending to be gcc.
---
 include/xgboost/json_io.h | 56 +++++++++++++--------------------------
 src/common/algorithm.h    |  2 +-
 src/common/json.cc        | 25 ++++++++---------
 3 files changed, 33 insertions(+), 50 deletions(-)

diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h
index e11545b04..3a73d170a 100644
--- a/include/xgboost/json_io.h
+++ b/include/xgboost/json_io.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) by Contributors 2019-2022
+/**
+ * Copyright 2019-2023, XGBoost Contributors
  */
 #ifndef XGBOOST_JSON_IO_H_
 #define XGBOOST_JSON_IO_H_
@@ -17,44 +17,26 @@
 #include <vector>
 
 namespace xgboost {
-namespace detail {
-// Whether char is signed is undefined, as a result we might or might not need
-// static_cast and std::to_string.
-template <typename Char, std::enable_if_t<std::is_signed<Char>::value>* = nullptr>
-std::string CharToStr(Char c) {
-  static_assert(std::is_same<Char, char>::value);
-  return std::string{c};
-}
-
-template <typename Char, std::enable_if_t<!std::is_signed<Char>::value>* = nullptr>
-std::string CharToStr(Char c) {
-  static_assert(std::is_same<Char, char>::value);
-  return (c <= static_cast<char>(127) ? std::string{c} : std::to_string(c));
-}
-}  // namespace detail
-
-/*
+/**
  * \brief A json reader, currently error checking and utf-8 is not fully supported.
  */
 class JsonReader {
+ public:
+  using Char = std::int8_t;
+
  protected:
-  size_t constexpr static kMaxNumLength =
-      std::numeric_limits<double>::max_digits10 + 1;
+  size_t constexpr static kMaxNumLength = std::numeric_limits<double>::max_digits10 + 1;
 
   struct SourceLocation {
    private:
-    size_t pos_ { 0 };  // current position in raw_str_
+    std::size_t pos_{0};  // current position in raw_str_
 
    public:
     SourceLocation() = default;
-    size_t  Pos()  const { return pos_; }
+    size_t Pos() const { return pos_; }
 
-    void Forward() {
-      pos_++;
-    }
-    void Forward(uint32_t n) {
-      pos_ += n;
-    }
+    void Forward() { pos_++; }
+    void Forward(uint32_t n) { pos_ += n; }
   } cursor_;
 
   StringView raw_str_;
@@ -62,7 +44,7 @@ class JsonReader {
  protected:
   void SkipSpaces();
 
-  char GetNextChar() {
+  Char GetNextChar() {
     if (XGBOOST_EXPECT((cursor_.Pos() == raw_str_.size()), false)) {
       return -1;
     }
@@ -71,24 +53,24 @@ class JsonReader {
     return ch;
   }
 
-  char PeekNextChar() {
+  Char PeekNextChar() {
     if (cursor_.Pos() == raw_str_.size()) {
       return -1;
     }
-    char ch = raw_str_[cursor_.Pos()];
+    Char ch = raw_str_[cursor_.Pos()];
     return ch;
   }
 
   /* \brief Skip spaces and consume next character. */
-  char GetNextNonSpaceChar() {
+  Char GetNextNonSpaceChar() {
     SkipSpaces();
     return GetNextChar();
   }
   /* \brief Consume next character without first skipping empty space, throw when the next
    *        character is not the expected one.
    */
-  char GetConsecutiveChar(char expected_char) {
-    char result = GetNextChar();
+  Char GetConsecutiveChar(char expected_char) {
+    Char result = GetNextChar();
     if (XGBOOST_EXPECT(result != expected_char, false)) { Expect(expected_char, result); }
     return result;
   }
@@ -96,7 +78,7 @@ class JsonReader {
   void Error(std::string msg) const;
 
   // Report expected character
-  void Expect(char c, char got) {
+  void Expect(Char c, Char got) {
     std::string msg = "Expecting: \"";
     msg += c;
     msg += "\", got: \"";
@@ -105,7 +87,7 @@ class JsonReader {
     } else if (got == 0) {
       msg += "\\0\"";
     } else {
-      msg += detail::CharToStr(got) + " \"";
+      msg += std::to_string(got) + " \"";
     }
     Error(msg);
   }
diff --git a/src/common/algorithm.h b/src/common/algorithm.h
index 739a84968..a34010cd0 100644
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@@ -14,7 +14,7 @@
 
 // clang with libstdc++ works as well
 #if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
-    !defined(__APPLE__) && __has_include(<omp.h>)
+    !defined(__APPLE__) && __has_include(<omp.h>) && __has_include(<parallel/algorithm>)
 #define GCC_HAS_PARALLEL 1
 #endif  // GLIC_VERSION
 
diff --git a/src/common/json.cc b/src/common/json.cc
index 8e2dd05ff..c3d61b47d 100644
--- a/src/common/json.cc
+++ b/src/common/json.cc
@@ -333,7 +333,7 @@ size_t constexpr JsonReader::kMaxNumLength;
 Json JsonReader::Parse() {
   while (true) {
     SkipSpaces();
-    char c = PeekNextChar();
+    auto c = PeekNextChar();
     if (c == -1) { break; }
 
     if (c == '{') {
@@ -408,13 +408,13 @@ void JsonReader::Error(std::string msg) const {
 }
 
 namespace {
-bool IsSpace(char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
+bool IsSpace(JsonReader::Char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; }
 }  // anonymous namespace
 
 // Json class
 void JsonReader::SkipSpaces() {
   while (cursor_.Pos() < raw_str_.size()) {
-    char c = raw_str_[cursor_.Pos()];
+    Char c = raw_str_[cursor_.Pos()];
     if (IsSpace(c)) {
       cursor_.Forward();
     } else {
@@ -436,12 +436,12 @@ void ParseStr(std::string const& str) {
 }
 
 Json JsonReader::ParseString() {
-  char ch { GetConsecutiveChar('\"') };  // NOLINT
+  Char ch { GetConsecutiveChar('\"') };  // NOLINT
   std::string str;
   while (true) {
     ch = GetNextChar();
     if (ch == '\\') {
-      char next = static_cast<char>(GetNextChar());
+      Char next{GetNextChar()};
       switch (next) {
         case 'r':  str += u8"\r"; break;
         case 'n':  str += u8"\n"; break;
@@ -466,8 +466,8 @@ Json JsonReader::ParseString() {
 }
 
 Json JsonReader::ParseNull() {
-  char ch = GetNextNonSpaceChar();
-  std::string buffer{ch};
+  Char ch = GetNextNonSpaceChar();
+  std::string buffer{static_cast<char>(ch)};
   for (size_t i = 0; i < 3; ++i) {
     buffer.push_back(GetNextChar());
   }
@@ -480,7 +480,7 @@ Json JsonReader::ParseNull() {
 Json JsonReader::ParseArray() {
   std::vector<Json> data;
 
-  char ch { GetConsecutiveChar('[') };  // NOLINT
+  Char ch { GetConsecutiveChar('[') };  // NOLINT
   while (true) {
     if (PeekNextChar() == ']') {
       GetConsecutiveChar(']');
@@ -503,7 +503,7 @@ Json JsonReader::ParseObject() {
 
   Object::Map data;
   SkipSpaces();
-  char ch = PeekNextChar();
+  auto ch = PeekNextChar();
 
   if (ch == '}') {
     GetConsecutiveChar('}');
@@ -652,7 +652,7 @@ Json JsonReader::ParseNumber() {
 
 Json JsonReader::ParseBoolean() {
   bool result = false;
-  char ch = GetNextNonSpaceChar();
+  Char ch = GetNextNonSpaceChar();
   std::string const t_value = u8"true";
   std::string const f_value = u8"false";
 
@@ -737,7 +737,8 @@ Json UBJReader::ParseArray() {
       case 'L':
         return ParseTypedArray<I64Array>(n);
       default:
-        LOG(FATAL) << "`" + std::string{type} + "` is not supported for typed array.";  // NOLINT
+        LOG(FATAL) << "`" + std::string{static_cast<char>(type)} +  // NOLINT
+                          "` is not supported for typed array.";
     }
   }
   std::vector<Json> results;
@@ -794,7 +795,7 @@ Json UBJReader::Load() {
 
 Json UBJReader::Parse() {
   while (true) {
-    char c = PeekNextChar();
+    auto c = PeekNextChar();
     if (c == -1) {
       break;
     }

From a551bed803d4a6d8689ae645a606802d75481e88 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 22 Mar 2023 20:51:14 +0800
Subject: [PATCH 26/32] Remove duplicated learning rate parameter. (#8941)

---
 src/gbm/gbtree.cc | 5 +++--
 src/gbm/gbtree.h  | 7 -------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 16609619c..34915d53e 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -996,8 +996,9 @@ class Dart : public GBTree {
   }
 
   // set normalization factors
-  inline size_t NormalizeTrees(size_t size_new_trees) {
-    float lr = 1.0 * dparam_.learning_rate / size_new_trees;
+  std::size_t NormalizeTrees(size_t size_new_trees) {
+    CHECK(tree_param_.GetInitialised());
+    float lr = 1.0 * tree_param_.learning_rate / size_new_trees;
     size_t num_drop = idx_drop_.size();
     if (num_drop == 0) {
       for (size_t i = 0; i < size_new_trees; ++i) {
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 10e6c415f..eb99822f3 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -111,8 +111,6 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
   bool one_drop;
   /*! \brief probability of skipping the dropout during an iteration */
   float skip_drop;
-  /*! \brief learning step size for a time */
-  float learning_rate;
   // declare parameters
   DMLC_DECLARE_PARAMETER(DartTrainParam) {
     DMLC_DECLARE_FIELD(sample_type)
@@ -136,11 +134,6 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
         .set_range(0.0f, 1.0f)
         .set_default(0.0f)
         .describe("Probability of skipping the dropout during a boosting iteration.");
-    DMLC_DECLARE_FIELD(learning_rate)
-        .set_lower_bound(0.0f)
-        .set_default(0.3f)
-        .describe("Learning rate(step size) of update.");
-    DMLC_DECLARE_ALIAS(learning_rate, eta);
   }
 };
 

From ea04d4c46c7b28f7e8459ba07db79a18e6200cc6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 22 Mar 2023 22:17:26 +0800
Subject: [PATCH 27/32] [doc] [dask] Troubleshooting NCCL errors. (#8943)

---
 doc/tutorials/dask.rst                        | 35 ++++++++++++-------
 src/common/device_helpers.cuh                 | 13 ++++---
 .../test_nccl_device_communicator.cu          | 16 +++++++--
 3 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index ba0da9089..c66c6131f 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -190,9 +190,9 @@ Scikit-Learn wrapper object:
     booster = cls.get_booster()
 
 
-**********************
-Scikit-Learn interface
-**********************
+********************************
+Scikit-Learn Estimator Interface
+********************************
 
 As mentioned previously, there's another interface that mimics the scikit-learn estimators
 with higher level of of abstraction.  The interface is easier to use compared to the
@@ -488,12 +488,13 @@ with dask and optuna.
 Troubleshooting
 ***************
 
-.. versionadded:: 1.6.0
 
-In some environments XGBoost might fail to resolve the IP address of the scheduler, a
-symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error
-during training.  A quick workaround is to specify the address explicitly.  To do that
-dask config is used:
+- In some environments XGBoost might fail to resolve the IP address of the scheduler, a
+  symptom is user receiving ``OSError: [Errno 99] Cannot assign requested address`` error
+  during training.  A quick workaround is to specify the address explicitly.  To do that
+  dask config is used:
+
+  .. versionadded:: 1.6.0
 
 .. code-block:: python
 
@@ -511,10 +512,20 @@ dask config is used:
         reg = dxgb.DaskXGBRegressor()
 
 
-Please note that XGBoost requires a different port than dask. By default, on a unix-like
-system XGBoost uses the port 0 to find available ports, which may fail if a user is
-running in a restricted docker environment. In this case, please open additional ports in
-the container and specify it as in the above snippet.
+- Please note that XGBoost requires a different port than dask. By default, on a unix-like
+  system XGBoost uses the port 0 to find available ports, which may fail if a user is
+  running in a restricted docker environment. In this case, please open additional ports
+  in the container and specify it as in the above snippet.
+
+- If you encounter an NCCL system error while training with GPU enabled, which usually
+  includes the error message ``NCCL failure: unhandled system error``, you can specify its
+  network configuration using one of the environment variables listed in the `NCCL
+  document <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`__ such as
+  ``NCCL_SOCKET_IFNAME``. In addition, you can use ``NCCL_DEBUG`` to obtain debug
+  logs.
+
+- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
+  that includes ``Multiple processes within a communication group ...`` upon initialization.
 
 ************
 IPv6 Support
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 58300d06c..f048aed43 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -118,17 +118,20 @@ namespace dh {
 #ifdef XGBOOST_USE_NCCL
 #define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
 
-inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
-                                     int line) {
+inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
   if (code != ncclSuccess) {
     std::stringstream ss;
-    ss << "NCCL failure :" << ncclGetErrorString(code);
+    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
+    ss << " " << file << "(" << line << ")\n";
     if (code == ncclUnhandledCudaError) {
       // nccl usually preserves the last error so we can get more details.
       auto err = cudaPeekAtLastError();
-      ss << " " << thrust::system_error(err, thrust::cuda_category()).what();
+      ss << "  CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
+    } else if (code == ncclSystemError) {
+      ss << "  This might be caused by a network configuration issue. Please consider specifying "
+            "the network interface for NCCL via environment variables listed in its reference: "
+            "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
     }
-    ss << " " << file << "(" << line << ")";
     LOG(FATAL) << ss.str();
   }
 
diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu
index 47de054c6..8ce877aef 100644
--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -1,10 +1,12 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #ifdef XGBOOST_USE_NCCL
 
 #include <gtest/gtest.h>
 
+#include <string>  // for string
+
 #include "../../../src/collective/nccl_device_communicator.cuh"
 
 namespace xgboost {
@@ -20,7 +22,15 @@ TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
+TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
+  try {
+    dh::safe_nccl(ncclSystemError);
+    FAIL() << "safe_nccl must throw on ncclSystemError.";  // otherwise the test is vacuous
+  } catch (dmlc::Error const& e) {
+    ASSERT_TRUE(std::string{e.what()}.find("environment variables") != std::string::npos);
+  }
+}
 }  // namespace collective
 }  // namespace xgboost
 
-#endif
+#endif  // XGBOOST_USE_NCCL

From 151882dd2624a00aa2e402370f0a9def2b336304 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Wed, 22 Mar 2023 23:49:56 +0800
Subject: [PATCH 28/32] Initial support for multi-target tree. (#8616)

* Implement multi-target for hist.

- Add new hist tree builder.
- Move data fetchers for tests.
- Dispatch function calls in gbm based on the tree type.
---
 demo/guide-python/multioutput_regression.py   |  36 ++-
 doc/parameter.rst                             |  12 +
 doc/tutorials/multioutput.rst                 |  29 +-
 include/xgboost/learner.h                     |   8 +-
 include/xgboost/linalg.h                      |   8 +-
 python-package/xgboost/sklearn.py             |  15 +
 python-package/xgboost/testing/__init__.py    | 245 +++++-----------
 python-package/xgboost/testing/data.py        | 147 +++++++++-
 python-package/xgboost/testing/params.py      |  24 +-
 src/c_api/c_api_utils.h                       |   1 +
 src/common/quantile.cc                        |   5 +-
 src/common/quantile.h                         |  13 -
 src/data/iterative_dmatrix.cc                 |   1 +
 src/gbm/gbtree.cc                             |  74 +++--
 src/gbm/gbtree.h                              |  35 ++-
 src/learner.cc                                |  12 +-
 src/metric/rank_metric.cu                     |   2 -
 src/predictor/cpu_predictor.cc                |  95 +++----
 src/predictor/gpu_predictor.cu                |   2 +-
 src/tree/hist/histogram.h                     |   4 +-
 src/tree/tree_model.cc                        |   2 +
 src/tree/updater_quantile_hist.cc             | 268 ++++++++++++++++--
 tests/ci_build/lint_python.py                 |  25 +-
 tests/cpp/gbm/test_gbtree.cc                  |   2 +-
 tests/cpp/helpers.h                           |   2 +-
 tests/cpp/predictor/test_predictor.cc         |   2 +-
 tests/cpp/test_multi_target.cc                |   4 +-
 tests/python-gpu/test_gpu_ranking.py          |   2 +-
 tests/python-gpu/test_gpu_updaters.py         |  15 +-
 tests/python/test_basic_models.py             |  39 ++-
 tests/python/test_callback.py                 |   2 +-
 tests/python/test_ranking.py                  |   2 +-
 tests/python/test_updaters.py                 | 110 +++++--
 .../test_with_dask/test_with_dask.py          |   2 +-
 34 files changed, 856 insertions(+), 389 deletions(-)

diff --git a/demo/guide-python/multioutput_regression.py b/demo/guide-python/multioutput_regression.py
index 375377e4e..078ec6b7d 100644
--- a/demo/guide-python/multioutput_regression.py
+++ b/demo/guide-python/multioutput_regression.py
@@ -7,6 +7,12 @@ The demo is adopted from scikit-learn:
 https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
 
 See :doc:`/tutorials/multioutput` for more information.
+
+.. note::
+
+    The feature is experimental. For the `multi_output_tree` strategy, many features are
+    missing.
+
 """
 
 import argparse
@@ -40,11 +46,18 @@ def gen_circle() -> Tuple[np.ndarray, np.ndarray]:
     return X, y
 
 
-def rmse_model(plot_result: bool):
+def rmse_model(plot_result: bool, strategy: str):
     """Draw a circle with 2-dim coordinate as target variables."""
     X, y = gen_circle()
     # Train a regressor on it
-    reg = xgb.XGBRegressor(tree_method="hist", n_estimators=64)
+    reg = xgb.XGBRegressor(
+        tree_method="hist",
+        n_estimators=128,
+        n_jobs=16,
+        max_depth=8,
+        multi_strategy=strategy,
+        subsample=0.6,
+    )
     reg.fit(X, y, eval_set=[(X, y)])
 
     y_predt = reg.predict(X)
@@ -52,7 +65,7 @@ def rmse_model(plot_result: bool):
         plot_predt(y, y_predt, "multi")
 
 
-def custom_rmse_model(plot_result: bool) -> None:
+def custom_rmse_model(plot_result: bool, strategy: str) -> None:
     """Train using Python implementation of Squared Error."""
 
     # As the experimental support status, custom objective doesn't support matrix as
@@ -88,9 +101,10 @@ def custom_rmse_model(plot_result: bool) -> None:
         {
             "tree_method": "hist",
             "num_target": y.shape[1],
+            "multi_strategy": strategy,
         },
         dtrain=Xy,
-        num_boost_round=100,
+        num_boost_round=128,
         obj=squared_log,
         evals=[(Xy, "Train")],
         evals_result=results,
@@ -107,6 +121,16 @@ if __name__ == "__main__":
     parser.add_argument("--plot", choices=[0, 1], type=int, default=1)
     args = parser.parse_args()
     # Train with builtin RMSE objective
-    rmse_model(args.plot == 1)
+    # - One model per output.
+    rmse_model(args.plot == 1, "one_output_per_tree")
+
+    # - One model for all outputs. This is still a work in progress; many features are
+    # missing.
+    rmse_model(args.plot == 1, "multi_output_tree")
+
     # Train with custom objective.
-    custom_rmse_model(args.plot == 1)
+    # - One model per output.
+    custom_rmse_model(args.plot == 1, "one_output_per_tree")
+    # - One model for all outputs. This is still a work in progress; many features are
+    # missing.
+    custom_rmse_model(args.plot == 1, "multi_output_tree")
diff --git a/doc/parameter.rst b/doc/parameter.rst
index ac566af74..1e703dacd 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -226,6 +226,18 @@ Parameters for Tree Booster
     list is a group of indices of features that are allowed to interact with each other.
     See :doc:`/tutorials/feature_interaction_constraint` for more information.
 
+* ``multi_strategy``, [default = ``one_output_per_tree``]
+
+  .. versionadded:: 2.0.0
+
+  .. note:: This parameter is a work in progress.
+
+  - The strategy used for training multi-target models, including multi-target regression
+    and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
+
+    - ``one_output_per_tree``: One model for each target.
+    - ``multi_output_tree``:  Use multi-target trees.
+
 .. _cat-param:
 
 Parameters for Categorical Feature
diff --git a/doc/tutorials/multioutput.rst b/doc/tutorials/multioutput.rst
index 280fb106f..983002aed 100644
--- a/doc/tutorials/multioutput.rst
+++ b/doc/tutorials/multioutput.rst
@@ -11,7 +11,11 @@ can be simultaneously classified as both sci-fi and comedy.  For detailed explan
 terminologies related to different multi-output models please refer to the
 :doc:`scikit-learn user guide <sklearn:modules/multiclass>`.
 
-Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
+**********************************
+Training with One-Model-Per-Target
+**********************************
+
+By default, XGBoost builds one model for each target similar to sklearn meta estimators,
 with the added benefit of reusing data and other integrated features like SHAP.  For a
 worked example of regression, see
 :ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
@@ -36,3 +40,26 @@ dense matrix for labels.
 
 
 The feature is still under development with limited support from objectives and metrics.
+
+*************************
+Training with Vector Leaf
+*************************
+
+.. versionadded:: 2.0
+
+.. note::
+
+   This is still a work in progress, and many features are missing.
+
+XGBoost can optionally build multi-output trees with the size of leaf equal to the number
+of targets when the tree method ``hist`` is used. The behavior can be controlled by the
+``multi_strategy`` training parameter, which can take the value ``one_output_per_tree`` (the
+default) for building one model per-target or ``multi_output_tree`` for building
+multi-output trees.
+
+.. code-block:: python
+
+  clf = xgb.XGBClassifier(tree_method="hist", multi_strategy="multi_output_tree")
+
+See :ref:`sphx_glr_python_examples_multioutput_regression.py` for a worked example with
+regression.
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 1d4e35a94..08e1ded09 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -286,8 +286,8 @@ struct LearnerModelParamLegacy;
  * \brief Strategy for building multi-target models.
  */
 enum class MultiStrategy : std::int32_t {
-  kComposite = 0,
-  kMonolithic = 1,
+  kOneOutputPerTree = 0,
+  kMultiOutputTree = 1,
 };
 
 /**
@@ -317,7 +317,7 @@ struct LearnerModelParam {
   /**
    * \brief Strategy for building multi-target models.
    */
-  MultiStrategy multi_strategy{MultiStrategy::kComposite};
+  MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
 
   LearnerModelParam() = default;
   // As the old `LearnerModelParamLegacy` is still used by binary IO, we keep
@@ -338,7 +338,7 @@ struct LearnerModelParam {
 
   void Copy(LearnerModelParam const& that);
   [[nodiscard]] bool IsVectorLeaf() const noexcept {
-    return multi_strategy == MultiStrategy::kMonolithic;
+    return multi_strategy == MultiStrategy::kMultiOutputTree;
   }
   [[nodiscard]] bst_target_t OutputLength() const noexcept { return this->num_output_group; }
   [[nodiscard]] bst_target_t LeafLength() const noexcept {
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index 3d6bcc962..65e9de6ba 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -530,17 +530,17 @@ class TensorView {
   /**
    * \brief Number of items in the tensor.
    */
-  LINALG_HD [[nodiscard]] std::size_t Size() const { return size_; }
+  [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; }
   /**
    * \brief Whether this is a contiguous array, both C and F contiguous returns true.
    */
-  LINALG_HD [[nodiscard]] bool Contiguous() const {
+  [[nodiscard]] LINALG_HD bool Contiguous() const {
     return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();
   }
   /**
    * \brief Whether it's a c-contiguous array.
    */
-  LINALG_HD [[nodiscard]] bool CContiguous() const {
+  [[nodiscard]] LINALG_HD bool CContiguous() const {
     StrideT stride;
     static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
     // It's contiguous if the stride can be calculated from shape.
@@ -550,7 +550,7 @@ class TensorView {
   /**
    * \brief Whether it's a f-contiguous array.
    */
-  LINALG_HD [[nodiscard]] bool FContiguous() const {
+  [[nodiscard]] LINALG_HD bool FContiguous() const {
     StrideT stride;
     static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
     // It's contiguous if the stride can be calculated from shape.
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 3204f5a2a..805eb75b3 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -312,6 +312,19 @@ __model_doc = f"""
         needs to be set to have categorical feature support. See :doc:`Categorical Data
         </tutorials/categorical>` and :ref:`cat-param` for details.
 
+    multi_strategy : Optional[str]
+
+        .. versionadded:: 2.0.0
+
+        .. note:: This parameter is a work in progress.
+
+        The strategy used for training multi-target models, including multi-target
+        regression and multi-class classification. See :doc:`/tutorials/multioutput` for
+        more information.
+
+        - ``one_output_per_tree``: One model for each target.
+        - ``multi_output_tree``:  Use multi-target trees.
+
     eval_metric : Optional[Union[str, List[str], Callable]]
 
         .. versionadded:: 1.6.0
@@ -624,6 +637,7 @@ class XGBModel(XGBModelBase):
         feature_types: Optional[FeatureTypes] = None,
         max_cat_to_onehot: Optional[int] = None,
         max_cat_threshold: Optional[int] = None,
+        multi_strategy: Optional[str] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
@@ -670,6 +684,7 @@ class XGBModel(XGBModelBase):
         self.feature_types = feature_types
         self.max_cat_to_onehot = max_cat_to_onehot
         self.max_cat_threshold = max_cat_threshold
+        self.multi_strategy = multi_strategy
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index bb13b5523..20a4c681e 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -10,11 +10,9 @@ import os
 import platform
 import socket
 import sys
-import zipfile
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
-from pathlib import Path
 from platform import system
 from typing import (
     Any,
@@ -29,7 +27,6 @@ from typing import (
     TypedDict,
     Union,
 )
-from urllib import request
 
 import numpy as np
 import pytest
@@ -38,6 +35,13 @@ from scipy import sparse
 import xgboost as xgb
 from xgboost.core import ArrayLike
 from xgboost.sklearn import SklObjective
+from xgboost.testing.data import (
+    get_california_housing,
+    get_cancer,
+    get_digits,
+    get_sparse,
+    memory,
+)
 
 hypothesis = pytest.importorskip("hypothesis")
 
@@ -45,13 +49,8 @@ hypothesis = pytest.importorskip("hypothesis")
 from hypothesis import strategies
 from hypothesis.extra.numpy import arrays
 
-joblib = pytest.importorskip("joblib")
 datasets = pytest.importorskip("sklearn.datasets")
 
-Memory = joblib.Memory
-
-memory = Memory("./cachedir", verbose=0)
-
 PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})
 
 
@@ -353,137 +352,6 @@ class TestDataset:
         return self.name
 
 
-@memory.cache
-def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
-    data = datasets.fetch_california_housing()
-    return data.data, data.target
-
-
-@memory.cache
-def get_digits() -> Tuple[np.ndarray, np.ndarray]:
-    data = datasets.load_digits()
-    return data.data, data.target
-
-
-@memory.cache
-def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
-    return datasets.load_breast_cancer(return_X_y=True)
-
-
-@memory.cache
-def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
-    rng = np.random.RandomState(199)
-    n = 2000
-    sparsity = 0.75
-    X, y = datasets.make_regression(n, random_state=rng)
-    flag = rng.binomial(1, sparsity, X.shape)
-    for i in range(X.shape[0]):
-        for j in range(X.shape[1]):
-            if flag[i, j]:
-                X[i, j] = np.nan
-    return X, y
-
-
-@memory.cache
-def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
-    """
-    Number of samples: 1460
-    Number of features: 20
-    Number of categorical features: 10
-    Number of numerical features: 10
-    """
-    from sklearn.datasets import fetch_openml
-
-    X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
-
-    categorical_columns_subset: List[str] = [
-        "BldgType",  # 5 cats, no nan
-        "GarageFinish",  # 3 cats, nan
-        "LotConfig",  # 5 cats, no nan
-        "Functional",  # 7 cats, no nan
-        "MasVnrType",  # 4 cats, nan
-        "HouseStyle",  # 8 cats, no nan
-        "FireplaceQu",  # 5 cats, nan
-        "ExterCond",  # 5 cats, no nan
-        "ExterQual",  # 4 cats, no nan
-        "PoolQC",  # 3 cats, nan
-    ]
-
-    numerical_columns_subset: List[str] = [
-        "3SsnPorch",
-        "Fireplaces",
-        "BsmtHalfBath",
-        "HalfBath",
-        "GarageCars",
-        "TotRmsAbvGrd",
-        "BsmtFinSF1",
-        "BsmtFinSF2",
-        "GrLivArea",
-        "ScreenPorch",
-    ]
-
-    X = X[categorical_columns_subset + numerical_columns_subset]
-    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
-    return X, y
-
-
-@memory.cache
-def get_mq2008(
-    dpath: str,
-) -> Tuple[
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-    sparse.csr_matrix,
-    np.ndarray,
-    np.ndarray,
-]:
-    from sklearn.datasets import load_svmlight_files
-
-    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
-    target = os.path.join(os.path.expanduser(dpath), "MQ2008.zip")
-    if not os.path.exists(target):
-        request.urlretrieve(url=src, filename=target)
-
-    with zipfile.ZipFile(target, "r") as f:
-        f.extractall(path=dpath)
-
-    (
-        x_train,
-        y_train,
-        qid_train,
-        x_test,
-        y_test,
-        qid_test,
-        x_valid,
-        y_valid,
-        qid_valid,
-    ) = load_svmlight_files(
-        (
-            Path(dpath) / "MQ2008" / "Fold1" / "train.txt",
-            Path(dpath) / "MQ2008" / "Fold1" / "test.txt",
-            Path(dpath) / "MQ2008" / "Fold1" / "vali.txt",
-        ),
-        query_id=True,
-        zero_based=False,
-    )
-
-    return (
-        x_train,
-        y_train,
-        qid_train,
-        x_test,
-        y_test,
-        qid_test,
-        x_valid,
-        y_valid,
-        qid_valid,
-    )
-
-
 # pylint: disable=too-many-arguments,too-many-locals
 @memory.cache
 def make_categorical(
@@ -738,20 +606,7 @@ _unweighted_datasets_strategy = strategies.sampled_from(
         TestDataset(
             "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
         ),
-        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
         TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
-        TestDataset(
-            "mtreg",
-            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
-            "reg:squarederror",
-            "rmse",
-        ),
-        TestDataset(
-            "mtreg-l1",
-            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
-            "reg:absoluteerror",
-            "mae",
-        ),
         TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
         TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
         TestDataset(
@@ -764,37 +619,71 @@ _unweighted_datasets_strategy = strategies.sampled_from(
 )
 
 
-@strategies.composite
-def _dataset_weight_margin(draw: Callable) -> TestDataset:
-    data: TestDataset = draw(_unweighted_datasets_strategy)
-    if draw(strategies.booleans()):
-        data.w = draw(
-            arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
-        )
-    if draw(strategies.booleans()):
-        num_class = 1
-        if data.objective == "multi:softmax":
-            num_class = int(np.max(data.y) + 1)
-        elif data.name.startswith("mtreg"):
-            num_class = data.y.shape[1]
+def make_datasets_with_margin(
+    unweighted_strategy: strategies.SearchStrategy,
+) -> Callable:
+    """Factory function for creating strategies that generate datasets with weights and
+    base margin.
 
-        data.margin = draw(
-            arrays(
-                np.float64,
-                (data.y.shape[0] * num_class),
-                elements=strategies.floats(0.5, 1.0),
+    """
+
+    @strategies.composite
+    def weight_margin(draw: Callable) -> TestDataset:
+        data: TestDataset = draw(unweighted_strategy)
+        if draw(strategies.booleans()):
+            data.w = draw(
+                arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
             )
-        )
-        assert data.margin is not None
-        if num_class != 1:
-            data.margin = data.margin.reshape(data.y.shape[0], num_class)
+        if draw(strategies.booleans()):
+            num_class = 1
+            if data.objective == "multi:softmax":
+                num_class = int(np.max(data.y) + 1)
+            elif data.name.startswith("mtreg"):
+                num_class = data.y.shape[1]
 
-    return data
+            data.margin = draw(
+                arrays(
+                    np.float64,
+                    (data.y.shape[0] * num_class),
+                    elements=strategies.floats(0.5, 1.0),
+                )
+            )
+            assert data.margin is not None
+            if num_class != 1:
+                data.margin = data.margin.reshape(data.y.shape[0], num_class)
+
+        return data
+
+    return weight_margin
 
 
-# A strategy for drawing from a set of example datasets
-# May add random weights to the dataset
-dataset_strategy = _dataset_weight_margin()
+# A strategy for drawing from a set of example datasets. May add random weights to the
+# dataset
+dataset_strategy = make_datasets_with_margin(_unweighted_datasets_strategy)()
+
+
+_unweighted_multi_datasets_strategy = strategies.sampled_from(
+    [
+        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
+        TestDataset(
+            "mtreg",
+            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+            "reg:squarederror",
+            "rmse",
+        ),
+        TestDataset(
+            "mtreg-l1",
+            lambda: datasets.make_regression(n_samples=128, n_features=2, n_targets=3),
+            "reg:absoluteerror",
+            "mae",
+        ),
+    ]
+)
+
+# A strategy for drawing from a set of multi-target/multi-class datasets.
+multi_dataset_strategy = make_datasets_with_margin(
+    _unweighted_multi_datasets_strategy
+)()
 
 
 def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
index a9ea0019c..477d0cf3d 100644
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -1,13 +1,20 @@
 """Utilities for data generation."""
-from typing import Any, Generator, Tuple, Union
+import os
+import zipfile
+from typing import Any, Generator, List, Tuple, Union
+from urllib import request
 
 import numpy as np
 import pytest
 from numpy.random import Generator as RNG
+from scipy import sparse
 
 import xgboost
 from xgboost.data import pandas_pyarrow_mapper
 
+joblib = pytest.importorskip("joblib")
+memory = joblib.Memory("./cachedir", verbose=0)
+
 
 def np_dtypes(
     n_samples: int, n_features: int
@@ -195,3 +202,141 @@ def check_inf(rng: RNG) -> None:
 
     with pytest.raises(ValueError, match="Input data contains `inf`"):
         xgboost.DMatrix(X, y)
+
+
+@memory.cache
+def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the California housing dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    data = datasets.fetch_california_housing()
+    return data.data, data.target
+
+
+@memory.cache
+def get_digits() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the digits dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    data = datasets.load_digits()
+    return data.data, data.target
+
+
+@memory.cache
+def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
+    """Fetch the breast cancer dataset from sklearn."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    return datasets.load_breast_cancer(return_X_y=True)
+
+
+@memory.cache
+def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
+    """Generate a sparse dataset."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    rng = np.random.RandomState(199)
+    n = 2000
+    sparsity = 0.75
+    X, y = datasets.make_regression(n, random_state=rng)
+    flag = rng.binomial(1, sparsity, X.shape)
+    for i in range(X.shape[0]):
+        for j in range(X.shape[1]):
+            if flag[i, j]:
+                X[i, j] = np.nan
+    return X, y
+
+
+@memory.cache
+def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Number of samples: 1460
+    Number of features: 20
+    Number of categorical features: 10
+    Number of numerical features: 10
+    """
+    datasets = pytest.importorskip("sklearn.datasets")
+    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+
+    categorical_columns_subset: List[str] = [
+        "BldgType",  # 5 cats, no nan
+        "GarageFinish",  # 3 cats, nan
+        "LotConfig",  # 5 cats, no nan
+        "Functional",  # 7 cats, no nan
+        "MasVnrType",  # 4 cats, nan
+        "HouseStyle",  # 8 cats, no nan
+        "FireplaceQu",  # 5 cats, nan
+        "ExterCond",  # 5 cats, no nan
+        "ExterQual",  # 4 cats, no nan
+        "PoolQC",  # 3 cats, nan
+    ]
+
+    numerical_columns_subset: List[str] = [
+        "3SsnPorch",
+        "Fireplaces",
+        "BsmtHalfBath",
+        "HalfBath",
+        "GarageCars",
+        "TotRmsAbvGrd",
+        "BsmtFinSF1",
+        "BsmtFinSF2",
+        "GrLivArea",
+        "ScreenPorch",
+    ]
+
+    X = X[categorical_columns_subset + numerical_columns_subset]
+    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
+    return X, y
+
+
+@memory.cache
+def get_mq2008(
+    dpath: str,
+) -> Tuple[
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+    sparse.csr_matrix,
+    np.ndarray,
+    np.ndarray,
+]:
+    """Fetch the mq2008 dataset."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
+    target = os.path.join(dpath, "MQ2008.zip")
+    if not os.path.exists(target):
+        request.urlretrieve(url=src, filename=target)
+
+    with zipfile.ZipFile(target, "r") as f:
+        f.extractall(path=dpath)
+
+    (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    ) = datasets.load_svmlight_files(
+        (
+            os.path.join(dpath, "MQ2008/Fold1/train.txt"),
+            os.path.join(dpath, "MQ2008/Fold1/test.txt"),
+            os.path.join(dpath, "MQ2008/Fold1/vali.txt"),
+        ),
+        query_id=True,
+        zero_based=False,
+    )
+
+    return (
+        x_train,
+        y_train,
+        qid_train,
+        x_test,
+        y_test,
+        qid_test,
+        x_valid,
+        y_valid,
+        qid_valid,
+    )
diff --git a/python-package/xgboost/testing/params.py b/python-package/xgboost/testing/params.py
index 3af3306da..e6ba73e1f 100644
--- a/python-package/xgboost/testing/params.py
+++ b/python-package/xgboost/testing/params.py
@@ -4,8 +4,8 @@ from typing import cast
 
 import pytest
 
-hypothesis = pytest.importorskip("hypothesis")
-from hypothesis import strategies  # pylint:disable=wrong-import-position
+strategies = pytest.importorskip("hypothesis.strategies")
+
 
 exact_parameter_strategy = strategies.fixed_dictionaries(
     {
@@ -41,6 +41,26 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
     and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
 )
 
+hist_multi_parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "max_depth": strategies.integers(1, 11),
+        "max_leaves": strategies.integers(0, 1024),
+        "max_bin": strategies.integers(2, 512),
+        "multi_strategy": strategies.sampled_from(
+            ["multi_output_tree", "one_output_per_tree"]
+        ),
+        "grow_policy": strategies.sampled_from(["lossguide", "depthwise"]),
+        "min_child_weight": strategies.floats(0.5, 2.0),
+        # We cannot enable subsampling as the training loss can increase
+        # 'subsample': strategies.floats(0.5, 1.0),
+        "colsample_bytree": strategies.floats(0.5, 1.0),
+        "colsample_bylevel": strategies.floats(0.5, 1.0),
+    }
+).filter(
+    lambda x: (cast(int, x["max_depth"]) > 0 or cast(int, x["max_leaves"]) > 0)
+    and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
+)
+
 cat_parameter_strategy = strategies.fixed_dictionaries(
     {
         "max_cat_to_onehot": strategies.integers(1, 128),
diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h
index 8908364f2..1af0206be 100644
--- a/src/c_api/c_api_utils.h
+++ b/src/c_api/c_api_utils.h
@@ -55,6 +55,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
       *out_dim = 2;
       shape.resize(*out_dim);
       shape.front() = rows;
+      // chunksize can be 1 if it's softmax
       shape.back() = std::min(groups, chunksize);
     }
     break;
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index 87eb0ec20..aaf271934 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -359,6 +359,7 @@ void AddCutPoint(typename SketchType::SummaryContainer const &summary, int max_b
                  HistogramCuts *cuts) {
   size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
   auto &cut_values = cuts->cut_values_.HostVector();
+  // we use the min_value as the first (0th) element, hence starting from 1.
   for (size_t i = 1; i < required_cuts; ++i) {
     bst_float cpt = summary.data[i].value;
     if (i == 1 || cpt > cut_values.back()) {
@@ -419,8 +420,8 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
     } else {
       AddCutPoint<WQSketch>(a, max_num_bins, cuts);
       // push a value that is greater than anything
-      const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
-                                         : cuts->min_vals_.HostVector()[fid];
+      const bst_float cpt =
+          (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
       // this must be bigger than last value in a scale
       const bst_float last = cpt + (fabs(cpt) + 1e-5f);
       cuts->cut_values_.HostVector().push_back(last);
diff --git a/src/common/quantile.h b/src/common/quantile.h
index c8dcf6ada..a19b4bbb0 100644
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -352,19 +352,6 @@ struct WQSummary {
       prev_rmax = data[i].rmax;
     }
   }
-  // check consistency of the summary
-  inline bool Check(const char *msg) const {
-    const float tol = 10.0f;
-    for (size_t i = 0; i < this->size; ++i) {
-      if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
-          data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
-        LOG(INFO) << "---------- WQSummary::Check did not pass ----------";
-        this->Print();
-        return false;
-      }
-    }
-    return true;
-  }
 };
 
 /*! \brief try to do efficient pruning */
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index c7ac492c9..dc6fb55e8 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -257,6 +257,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   }
   iter.Reset();
   CHECK_EQ(rbegin, Info().num_row_);
+  CHECK_EQ(this->ghist_->Features(), Info().num_col_);
 
   /**
    * Generate column matrix
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 34915d53e..a912d6a75 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -10,6 +10,7 @@
 #include <dmlc/parameter.h>
 
 #include <algorithm>
+#include <cinttypes>  // for uint32_t
 #include <limits>
 #include <memory>
 #include <string>
@@ -27,9 +28,11 @@
 #include "xgboost/host_device_vector.h"
 #include "xgboost/json.h"
 #include "xgboost/logging.h"
+#include "xgboost/model.h"
 #include "xgboost/objective.h"
 #include "xgboost/predictor.h"
-#include "xgboost/string_view.h"
+#include "xgboost/string_view.h"  // for StringView
+#include "xgboost/tree_model.h"   // for RegTree
 #include "xgboost/tree_updater.h"
 
 namespace xgboost::gbm {
@@ -131,6 +134,12 @@ void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
     // set, since only experts are expected to do so.
     return;
   }
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
+
   // tparam_ is set before calling this function.
   if (tparam_.tree_method != TreeMethod::kAuto) {
     return;
@@ -175,12 +184,12 @@ void GBTree::ConfigureUpdaters() {
     case TreeMethod::kExact:
       tparam_.updater_seq = "grow_colmaker,prune";
       break;
-    case TreeMethod::kHist:
-      LOG(INFO) <<
-          "Tree method is selected to be 'hist', which uses a "
-          "single updater grow_quantile_histmaker.";
+    case TreeMethod::kHist: {
+      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
+                   "grow_quantile_histmaker.";
       tparam_.updater_seq = "grow_quantile_histmaker";
       break;
+    }
     case TreeMethod::kGPUHist: {
       common::AssertGPUSupport();
       tparam_.updater_seq = "grow_gpu_hist";
@@ -209,11 +218,9 @@ void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_thre
     GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair);
   } else {
     std::vector<GradientPair> &tmp_h = out_gpair->HostVector();
-    auto nsize = static_cast<bst_omp_uint>(out_gpair->Size());
-    const auto &gpair_h = in_gpair->ConstHostVector();
-    common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
-      tmp_h[i] = gpair_h[i * n_groups + group_id];
-    });
+    const auto& gpair_h = in_gpair->ConstHostVector();
+    common::ParallelFor(out_gpair->Size(), n_threads,
+                        [&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; });
   }
 }
 
@@ -234,6 +241,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
   CHECK_EQ(model_.param.num_parallel_tree, trees.size());
   CHECK_EQ(model_.param.num_parallel_tree, 1)
       << "Boosting random forest is not supported for current objective.";
+  CHECK(!trees.front()->IsMultiTarget()) << "Update tree leaf" << MTNotImplemented();
   CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
   for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
     auto const& position = node_position.at(tree_idx);
@@ -245,17 +253,18 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
 void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt, ObjFunction const* obj) {
   std::vector<std::vector<std::unique_ptr<RegTree>>> new_trees;
-  const int ngroup = model_.learner_model_param->num_output_group;
+  const int ngroup = model_.learner_model_param->OutputLength();
   ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");
+
   // Weird case that tree method is cpu-based but gpu_id is set.  Ideally we should let
   // `gpu_id` be the single source of determining what algorithms to run, but that will
   // break a lots of existing code.
   auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
-  auto out = linalg::TensorView<float, 2>{
+  auto out = linalg::MakeTensorView(
+      device,
       device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(),
-      {static_cast<size_t>(p_fmat->Info().num_row_), static_cast<size_t>(ngroup)},
-      device};
+      p_fmat->Info().num_row_, model_.learner_model_param->OutputLength());
   CHECK_NE(ngroup, 0);
 
   if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) {
@@ -266,7 +275,13 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
   // position is negated if the row is sampled out.
   std::vector<HostDeviceVector<bst_node_t>> node_position;
 
-  if (ngroup == 1) {
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    std::vector<std::unique_ptr<RegTree>> ret;
+    BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
+    UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
+    // The prediction cache is not updated for vector-leaf trees yet.
+    new_trees.push_back(std::move(ret));
+  } else if (model_.learner_model_param->OutputLength() == 1) {
     std::vector<std::unique_ptr<RegTree>> ret;
     BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret);
     UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret);
@@ -383,11 +398,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
   }
 
   // update the trees
-  CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_)
-      << "Mismatching size between number of rows from input data and size of "
-         "gradient vector.";
+  auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_;
+  StringView msg{
+      "Mismatching size between number of rows from input data and size of gradient vector."};
+  if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) {
+    CHECK_EQ(n_out % gpair->Size(), 0) << msg;
+  } else {
+    CHECK_EQ(gpair->Size(), n_out) << msg;
+  }
 
-  CHECK(out_position);
   out_position->resize(new_trees.size());
 
   // Rescale learning rate according to the size of trees
@@ -402,8 +421,12 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
 
 void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
   monitor_.Start("CommitModel");
-  for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) {
-    model_.CommitModel(std::move(new_trees[gid]), gid);
+  if (this->model_.learner_model_param->IsVectorLeaf()) {
+    model_.CommitModel(std::move(new_trees[0]), 0);
+  } else {
+    for (std::uint32_t gid = 0; gid < model_.learner_model_param->OutputLength(); ++gid) {
+      model_.CommitModel(std::move(new_trees[gid]), gid);
+    }
   }
   monitor_.Stop("CommitModel");
 }
@@ -564,11 +587,10 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
   if (out_preds->version == 0) {
     // out_preds->Size() can be non-zero as it's initialized here before any
     // tree is built at the 0^th iterator.
-    predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions,
-                                  model_);
+    predictor->InitOutPredictions(p_fmat->Info(), &out_preds->predictions, model_);
   }
 
-  uint32_t tree_begin, tree_end;
+  std::uint32_t tree_begin, tree_end;
   std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
   CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
   if (tree_end > tree_begin) {
@@ -577,7 +599,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat,
   if (reset) {
     out_preds->version = 0;
   } else {
-    uint32_t delta = layer_end - out_preds->version;
+    std::uint32_t delta = layer_end - out_preds->version;
     out_preds->Update(delta);
   }
 }
@@ -770,6 +792,7 @@ class Dart : public GBTree {
   void PredictBatchImpl(DMatrix *p_fmat, PredictionCacheEntry *p_out_preds,
                         bool training, unsigned layer_begin,
                         unsigned layer_end) const {
+    CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
     auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
     CHECK(predictor);
     predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
@@ -830,6 +853,7 @@ class Dart : public GBTree {
   void InplacePredict(std::shared_ptr<DMatrix> p_fmat, float missing,
                       PredictionCacheEntry* p_out_preds, uint32_t layer_begin,
                       unsigned layer_end) const override {
+    CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
     uint32_t tree_begin, tree_end;
     std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
     auto n_groups = model_.learner_model_param->num_output_group;
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index eb99822f3..b64532c61 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -139,14 +139,22 @@ struct DartTrainParam : public XGBoostParameter<DartTrainParam> {
 
 namespace detail {
 // From here on, layer becomes concrete trees.
-inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const &model,
-                                                 size_t layer_begin,
-                                                 size_t layer_end) {
-  bst_group_t groups = model.learner_model_param->num_output_group;
-  uint32_t tree_begin = layer_begin * groups * model.param.num_parallel_tree;
-  uint32_t tree_end = layer_end * groups * model.param.num_parallel_tree;
+inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const& model,
+                                                 std::uint32_t layer_begin,
+                                                 std::uint32_t layer_end) {
+  std::uint32_t tree_begin;
+  std::uint32_t tree_end;
+  if (model.learner_model_param->IsVectorLeaf()) {
+    tree_begin = layer_begin * model.param.num_parallel_tree;
+    tree_end = layer_end * model.param.num_parallel_tree;
+  } else {
+    bst_group_t groups = model.learner_model_param->OutputLength();
+    tree_begin = layer_begin * groups * model.param.num_parallel_tree;
+    tree_end = layer_end * groups * model.param.num_parallel_tree;
+  }
+
   if (tree_end == 0) {
-    tree_end = static_cast<uint32_t>(model.trees.size());
+    tree_end = model.trees.size();
   }
   if (model.trees.size() != 0) {
     CHECK_LE(tree_begin, tree_end);
@@ -234,22 +242,25 @@ class GBTree : public GradientBooster {
   void LoadModel(Json const& in) override;
 
   // Number of trees per layer.
-  auto LayerTrees() const {
-    auto n_trees = model_.learner_model_param->num_output_group * model_.param.num_parallel_tree;
-    return n_trees;
+  [[nodiscard]] std::uint32_t LayerTrees() const {
+    if (model_.learner_model_param->IsVectorLeaf()) {
+      return model_.param.num_parallel_tree;
+    }
+    return model_.param.num_parallel_tree * model_.learner_model_param->OutputLength();
   }
 
   // slice the trees, out must be already allocated
   void Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
              GradientBooster *out, bool* out_of_bound) const override;
 
-  int32_t BoostedRounds() const override {
+  [[nodiscard]] std::int32_t BoostedRounds() const override {
     CHECK_NE(model_.param.num_parallel_tree, 0);
     CHECK_NE(model_.learner_model_param->num_output_group, 0);
+
     return model_.trees.size() / this->LayerTrees();
   }
 
-  bool ModelFitted() const override {
+  [[nodiscard]] bool ModelFitted() const override {
     return !model_.trees.empty() || !model_.trees_to_update.empty();
   }
 
diff --git a/src/learner.cc b/src/learner.cc
index 14f57a5ba..9b1d65ce6 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -326,7 +326,7 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
   std::string booster;
   std::string objective;
   // This is a training parameter and is not saved (nor loaded) in the model.
-  MultiStrategy multi_strategy{MultiStrategy::kComposite};
+  MultiStrategy multi_strategy{MultiStrategy::kOneOutputPerTree};
 
   // declare parameters
   DMLC_DECLARE_PARAMETER(LearnerTrainParam) {
@@ -339,12 +339,12 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
         .set_default("reg:squarederror")
         .describe("Objective function used for obtaining gradient.");
     DMLC_DECLARE_FIELD(multi_strategy)
-        .add_enum("composite", MultiStrategy::kComposite)
-        .add_enum("monolithic", MultiStrategy::kMonolithic)
-        .set_default(MultiStrategy::kComposite)
+        .add_enum("one_output_per_tree", MultiStrategy::kOneOutputPerTree)
+        .add_enum("multi_output_tree", MultiStrategy::kMultiOutputTree)
+        .set_default(MultiStrategy::kOneOutputPerTree)
         .describe(
-            "Strategy used for training multi-target models. `monolithic` means building one "
-            "single tree for all targets.");
+            "Strategy used for training multi-target models. `multi_output_tree` means building "
+            "one single tree for all targets.");
   }
 };
 
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index 00116ebdb..386f0d53d 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -145,7 +145,6 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
   auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
 
   auto d_group_ptr = p_cache->DataGroupPtr(ctx);
-  auto n_groups = info.group_ptr_.size() - 1;
 
   auto d_inv_idcg = p_cache->InvIDCG(ctx);
   auto d_sorted_idx = p_cache->SortedIdx(ctx, d_predt.Values());
@@ -171,7 +170,6 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
                             HostDeviceVector<float> const &predt, bool minus,
                             std::shared_ptr<ltr::MAPCache> p_cache) {
   auto d_group_ptr = p_cache->DataGroupPtr(ctx);
-  auto n_groups = info.group_ptr_.size() - 1;
   auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);
 
   predt.SetDevice(ctx->gpu_id);
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 0c045dda0..3d5dfbd67 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -87,30 +87,6 @@ bst_float PredValueByOneTree(const RegTree::FVec &p_feats, RegTree const &tree,
                               : GetLeafIndex<false, has_categorical>(tree, p_feats, cats);
   return tree[leaf].LeafValue();
 }
-
-void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
-                       const size_t tree_end, const size_t predict_offset,
-                       const std::vector<RegTree::FVec> &thread_temp, const size_t offset,
-                       const size_t block_size, linalg::TensorView<float, 2> out_predt) {
-  for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
-    const size_t gid = model.tree_info[tree_id];
-    auto const &tree = *model.trees[tree_id];
-    auto const &cats = tree.GetCategoriesMatrix();
-    auto has_categorical = tree.HasCategoricalSplit();
-
-    if (has_categorical) {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        out_predt(predict_offset + i, gid) +=
-            PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
-      }
-    } else {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        out_predt(predict_offset + i, gid) +=
-            PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
-      }
-    }
-  }
-}
 }  // namespace scalar
 
 namespace multi {
@@ -128,7 +104,7 @@ bst_node_t GetLeafIndex(MultiTargetTree const &tree, const RegTree::FVec &feat,
 }
 
 template <bool has_categorical>
-void PredValueByOneTree(const RegTree::FVec &p_feats, MultiTargetTree const &tree,
+void PredValueByOneTree(RegTree::FVec const &p_feats, MultiTargetTree const &tree,
                         RegTree::CategoricalSplitMatrix const &cats,
                         linalg::VectorView<float> out_predt) {
   bst_node_t const leaf = p_feats.HasMissing()
@@ -140,36 +116,52 @@ void PredValueByOneTree(const RegTree::FVec &p_feats, MultiTargetTree const &tre
     out_predt(i) += leaf_value(i);
   }
 }
+}  // namespace multi
 
-void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_begin,
-                       const size_t tree_end, const size_t predict_offset,
-                       const std::vector<RegTree::FVec> &thread_temp, const size_t offset,
-                       const size_t block_size, linalg::TensorView<float, 2> out_predt) {
-  for (size_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
+namespace {
+void PredictByAllTrees(gbm::GBTreeModel const &model, std::uint32_t const tree_begin,
+                       std::uint32_t const tree_end, std::size_t const predict_offset,
+                       std::vector<RegTree::FVec> const &thread_temp, std::size_t const offset,
+                       std::size_t const block_size, linalg::MatrixView<float> out_predt) {
+  for (std::uint32_t tree_id = tree_begin; tree_id < tree_end; ++tree_id) {
     auto const &tree = *model.trees.at(tree_id);
-    auto cats = tree.GetCategoriesMatrix();
+    auto const &cats = tree.GetCategoriesMatrix();
     bool has_categorical = tree.HasCategoricalSplit();
 
-    if (has_categorical) {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
-        PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
-                                 t_predts);
+    if (tree.IsMultiTarget()) {
+      if (has_categorical) {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
+          multi::PredValueByOneTree<true>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
+                                          t_predts);
+        }
+      } else {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
+          multi::PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(),
+                                           cats, t_predts);
+        }
       }
     } else {
-      for (std::size_t i = 0; i < block_size; ++i) {
-        auto t_predts = out_predt.Slice(predict_offset + i, linalg::All());
-        PredValueByOneTree<false>(thread_temp[offset + i], *tree.GetMultiTargetTree(), cats,
-                                  t_predts);
+      auto const gid = model.tree_info[tree_id];
+      if (has_categorical) {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          out_predt(predict_offset + i, gid) +=
+              scalar::PredValueByOneTree<true>(thread_temp[offset + i], tree, cats);
+        }
+      } else {
+        for (std::size_t i = 0; i < block_size; ++i) {
+          out_predt(predict_offset + i, gid) +=
+              scalar::PredValueByOneTree<false>(thread_temp[offset + i], tree, cats);
+        }
       }
     }
   }
 }
-}  // namespace multi
 
 template <typename DataView>
 void FVecFill(const size_t block_size, const size_t batch_offset, const int num_feature,
-              DataView* batch, const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
+              DataView *batch, const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
     if (feats.Size() == 0) {
@@ -181,8 +173,8 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
 }
 
 template <typename DataView>
-void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
+void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch,
+              const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
     const SparsePage::Inst inst = (*batch)[batch_offset + i];
@@ -190,9 +182,7 @@ void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batc
   }
 }
 
-namespace {
 static std::size_t constexpr kUnroll = 8;
-}  // anonymous namespace
 
 struct SparsePageView {
   bst_row_t base_rowid;
@@ -292,7 +282,7 @@ class AdapterView {
 
 template <typename DataView, size_t block_of_rows_size>
 void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &model,
-                                     int32_t tree_begin, int32_t tree_end,
+                                     std::uint32_t tree_begin, std::uint32_t tree_end,
                                      std::vector<RegTree::FVec> *p_thread_temp, int32_t n_threads,
                                      linalg::TensorView<float, 2> out_predt) {
   auto &thread_temp = *p_thread_temp;
@@ -310,14 +300,8 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &mod
 
     FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp);
     // process block of rows through all trees to keep cache locality
-    if (model.learner_model_param->IsVectorLeaf()) {
-      multi::PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid,
-                               thread_temp, fvec_offset, block_size, out_predt);
-    } else {
-      scalar::PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid,
-                                thread_temp, fvec_offset, block_size, out_predt);
-    }
-
+    PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
+                      fvec_offset, block_size, out_predt);
     FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
   });
 }
@@ -348,7 +332,6 @@ void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
   FillNodeMeanValues(tree, 0, mean_values);
 }
 
-namespace {
 // init thread buffers
 static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) {
   int prev_thread_temp_size = out->size();
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index ecd399e22..4a5c5b104 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -411,7 +411,7 @@ class DeviceModel {
 
     this->tree_beg_ = tree_begin;
     this->tree_end_ = tree_end;
-    this->num_group = model.learner_model_param->num_output_group;
+    this->num_group = model.learner_model_param->OutputLength();
   }
 };
 
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index 50b90f244..562a0b2d4 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -306,9 +306,9 @@ class HistogramBuilder {
 
 // Construct a work space for building histogram.  Eventually we should move this
 // function into histogram builder once hist tree method supports external memory.
-template <typename Partitioner>
+template <typename Partitioner, typename ExpandEntry = CPUExpandEntry>
 common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
-                                          std::vector<CPUExpandEntry> const &nodes_to_build) {
+                                          std::vector<ExpandEntry> const &nodes_to_build) {
   std::vector<size_t> partition_size(nodes_to_build.size(), 0);
   for (auto const &partition : partitioners) {
     size_t k = 0;
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc
index 8f297f46d..7550904b5 100644
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -889,6 +889,8 @@ void RegTree::Save(dmlc::Stream* fo) const {
   CHECK_EQ(param_.num_nodes, static_cast<int>(stats_.size()));
   CHECK_EQ(param_.deprecated_num_roots, 1);
   CHECK_NE(param_.num_nodes, 0);
+  CHECK(!IsMultiTarget())
+      << "Please use JSON/UBJSON for saving models with multi-target trees.";
   CHECK(!HasCategoricalSplit())
       << "Please use JSON/UBJSON for saving models with categorical splits.";
 
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 7e5955dc8..012b8e781 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -4,36 +4,39 @@
  * \brief use quantized feature values to construct a tree
  * \author Philip Cho, Tianqi Checn, Egor Smirnov
  */
-#include <algorithm>                         // for max
+#include <algorithm>                         // for max, copy, transform
 #include <cstddef>                           // for size_t
-#include <cstdint>                           // for uint32_t
-#include <memory>                            // for unique_ptr, allocator, make_unique, make_shared
-#include <ostream>                           // for operator<<, char_traits, basic_ostream
-#include <tuple>                             // for apply
+#include <cstdint>                           // for uint32_t, int32_t
+#include <memory>                            // for unique_ptr, allocator, make_unique, shared_ptr
+#include <numeric>                           // for accumulate
+#include <ostream>                           // for basic_ostream, char_traits, operator<<
 #include <utility>                           // for move, swap
 #include <vector>                            // for vector
 
 #include "../collective/communicator-inl.h"  // for Allreduce, IsDistributed
 #include "../collective/communicator.h"      // for Operation
 #include "../common/hist_util.h"             // for HistogramCuts, HistCollection
+#include "../common/linalg_op.h"             // for begin, cbegin, cend
 #include "../common/random.h"                // for ColumnSampler
 #include "../common/threading_utils.h"       // for ParallelFor
 #include "../common/timer.h"                 // for Monitor
+#include "../common/transform_iterator.h"    // for IndexTransformIter, MakeIndexTransformIter
 #include "../data/gradient_index.h"          // for GHistIndexMatrix
 #include "common_row_partitioner.h"          // for CommonRowPartitioner
+#include "dmlc/omp.h"                        // for omp_get_thread_num
 #include "dmlc/registry.h"                   // for DMLC_REGISTRY_FILE_TAG
 #include "driver.h"                          // for Driver
-#include "hist/evaluate_splits.h"            // for HistEvaluator, UpdatePredictionCacheImpl
-#include "hist/expand_entry.h"               // for CPUExpandEntry
+#include "hist/evaluate_splits.h"            // for HistEvaluator, HistMultiEvaluator, UpdatePre...
+#include "hist/expand_entry.h"               // for MultiExpandEntry, CPUExpandEntry
 #include "hist/histogram.h"                  // for HistogramBuilder, ConstructHistSpace
 #include "hist/sampler.h"                    // for SampleGradient
-#include "param.h"                           // for TrainParam, GradStats
-#include "xgboost/base.h"                    // for GradientPair, GradientPairInternal, bst_node_t
+#include "param.h"                           // for TrainParam, SplitEntryContainer, GradStats
+#include "xgboost/base.h"                    // for GradientPairInternal, GradientPair, bst_targ...
 #include "xgboost/context.h"                 // for Context
 #include "xgboost/data.h"                    // for BatchIterator, BatchSet, DMatrix, MetaInfo
 #include "xgboost/host_device_vector.h"      // for HostDeviceVector
-#include "xgboost/linalg.h"                  // for TensorView, MatrixView, UnravelIndex, All
-#include "xgboost/logging.h"                 // for LogCheck_EQ, LogCheck_GE, CHECK_EQ, LOG, LOG...
+#include "xgboost/linalg.h"                  // for All, MatrixView, TensorView, Matrix, Empty
+#include "xgboost/logging.h"                 // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE
 #include "xgboost/span.h"                    // for Span, operator!=, SpanIterator
 #include "xgboost/string_view.h"             // for operator<<
 #include "xgboost/task.h"                    // for ObjInfo
@@ -105,6 +108,212 @@ void UpdateTree(common::Monitor *monitor_, linalg::MatrixView<GradientPair const
   monitor_->Stop(__func__);
 }
 
+/**
+ * \brief Updater for building multi-target trees. The implementation simply iterates over
+ *        each target.
+ */
+class MultiTargetHistBuilder {
+ private:
+  common::Monitor *monitor_{nullptr};
+  TrainParam const *param_{nullptr};
+  std::shared_ptr<common::ColumnSampler> col_sampler_;
+  std::unique_ptr<HistMultiEvaluator> evaluator_;
+  // Histogram builder for each target.
+  std::vector<HistogramBuilder<MultiExpandEntry>> histogram_builder_;
+  Context const *ctx_{nullptr};
+  // Partitioner for each data batch.
+  std::vector<CommonRowPartitioner> partitioner_;
+  // Pointer to the last updated tree, used for updating the prediction cache.
+  RegTree const *p_last_tree_{nullptr};
+  // Task descriptor; its UpdateTreeLeaf() decides whether LeafPartition does any work.
+  ObjInfo const *task_{nullptr};
+
+ public:
+  void UpdatePosition(DMatrix *p_fmat, RegTree const *p_tree,
+                      std::vector<MultiExpandEntry> const &applied) {
+    monitor_->Start(__func__);
+    std::size_t page_id{0};
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
+      this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
+      page_id++;
+    }
+    monitor_->Stop(__func__);
+  }
+  // Apply the split stored in `candidate` to the tree via the multi-target evaluator.
+  void ApplyTreeSplit(MultiExpandEntry const &candidate, RegTree *p_tree) {
+    this->evaluator_->ApplyTreeSplit(candidate, p_tree);
+  }
+  // Reset per-tree state: one row partitioner per page and one histogram builder per target.
+  void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
+    monitor_->Start(__func__);
+    // All pages are required to report the same total number of bins (checked below).
+    std::size_t page_id = 0;
+    bst_bin_t n_total_bins = 0;
+    partitioner_.clear();
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      if (n_total_bins == 0) {
+        n_total_bins = page.cut.TotalBins();
+      } else {
+        CHECK_EQ(n_total_bins, page.cut.TotalBins());
+      }
+      partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
+      page_id++;
+    }
+
+    bst_target_t n_targets = p_tree->NumTargets();
+    histogram_builder_.clear();
+    for (std::size_t i = 0; i < n_targets; ++i) {
+      histogram_builder_.emplace_back();
+      histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
+                                      collective::IsDistributed(), p_fmat->IsColumnSplit());
+    }
+
+    evaluator_ = std::make_unique<HistMultiEvaluator>(ctx_, p_fmat->Info(), param_, col_sampler_);
+    p_last_tree_ = p_tree;
+    monitor_->Stop(__func__);
+  }
+  // Sum root gradients, build the root histograms, set the root leaf and evaluate its split.
+  MultiExpandEntry InitRoot(DMatrix *p_fmat, linalg::MatrixView<GradientPair const> gpair,
+                            RegTree *p_tree) {
+    monitor_->Start(__func__);
+    MultiExpandEntry best;
+    best.nid = RegTree::kRoot;
+    best.depth = 0;
+    // Thread-local accumulators: row = thread id, column = target.
+    auto n_targets = p_tree->NumTargets();
+    linalg::Matrix<GradientPairPrecise> root_sum_tloc =
+        linalg::Empty<GradientPairPrecise>(ctx_, ctx_->Threads(), n_targets);
+    CHECK_EQ(root_sum_tloc.Shape(1), gpair.Shape(1));
+    auto h_root_sum_tloc = root_sum_tloc.HostView();
+    common::ParallelFor(gpair.Shape(0), ctx_->Threads(), [&](auto i) {
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        h_root_sum_tloc(omp_get_thread_num(), t) += GradientPairPrecise{gpair(i, t)};
+      }
+    });
+    // Aggregate to the first row.
+    auto root_sum = h_root_sum_tloc.Slice(0, linalg::All());
+    for (std::int32_t tidx{1}; tidx < ctx_->Threads(); ++tidx) {
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        root_sum(t) += h_root_sum_tloc(tidx, t);
+      }
+    }
+    CHECK(root_sum.CContiguous());
+    collective::Allreduce<collective::Operation::kSum>(
+        reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2);
+    // The root is the only node to build; nothing is passed for subtraction (`{}` below).
+    std::vector<MultiExpandEntry> nodes{best};
+    std::size_t i = 0;
+    auto space = ConstructHistSpace(partitioner_, nodes);
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        auto t_gpair = gpair.Slice(linalg::All(), t);
+        histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
+                                        nodes, {}, t_gpair.Values());
+      }
+      i++;
+    }
+    // Compute the root weight and scale it by the learning rate before storing it as a leaf.
+    auto weight = evaluator_->InitRoot(root_sum);
+    auto weight_t = weight.HostView();
+    std::transform(linalg::cbegin(weight_t), linalg::cend(weight_t), linalg::begin(weight_t),
+                   [&](float w) { return w * param_->learning_rate; });
+
+    p_tree->SetLeaf(RegTree::kRoot, weight_t);
+    std::vector<common::HistCollection const *> hists;
+    for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
+      hists.push_back(&histogram_builder_[t].Histogram());
+    }
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
+      break;
+    }
+    monitor_->Stop(__func__);
+
+    return nodes.front();
+  }
+  // Build per-target histograms for the children of every candidate in `valid_candidates`.
+  void BuildHistogram(DMatrix *p_fmat, RegTree const *p_tree,
+                      std::vector<MultiExpandEntry> const &valid_candidates,
+                      linalg::MatrixView<GradientPair const> gpair) {
+    monitor_->Start(__func__);
+    std::vector<MultiExpandEntry> nodes_to_build;
+    std::vector<MultiExpandEntry> nodes_to_sub;
+
+    for (auto const &c : valid_candidates) {
+      auto left_nidx = p_tree->LeftChild(c.nid);
+      auto right_nidx = p_tree->RightChild(c.nid);
+      // Default to building the left child; swap if the right child has the smaller hessian sum.
+      auto build_nidx = left_nidx;
+      auto subtract_nidx = right_nidx;
+      auto lit =
+          common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
+      auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
+      auto rit =
+          common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
+      auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
+      auto fewer_right = right_sum < left_sum;
+      if (fewer_right) {
+        std::swap(build_nidx, subtract_nidx);
+      }
+      nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx));
+      nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx));
+    }
+
+    std::size_t i = 0;
+    auto space = ConstructHistSpace(partitioner_, nodes_to_build);
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
+        auto t_gpair = gpair.Slice(linalg::All(), t);
+        // Make sure the gradient matrix is f-order.
+        CHECK(t_gpair.Contiguous());
+        histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
+                                        nodes_to_build, nodes_to_sub, t_gpair.Values());
+      }
+      i++;
+    }
+    monitor_->Stop(__func__);
+  }
+  // Evaluate `best_splits` using the per-target histograms and the cuts of the first page.
+  void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,
+                      std::vector<MultiExpandEntry> *best_splits) {
+    monitor_->Start(__func__);
+    std::vector<common::HistCollection const *> hists;
+    for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
+      hists.push_back(&histogram_builder_[t].Histogram());
+    }
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
+      break;
+    }
+    monitor_->Stop(__func__);
+  }
+  // Run every partitioner's LeafPartition on `p_out_position` if the task updates tree leaves.
+  void LeafPartition(RegTree const &tree, linalg::MatrixView<GradientPair const> gpair,
+                     std::vector<bst_node_t> *p_out_position) {
+    monitor_->Start(__func__);
+    // Guard instead of returning early so that the monitor is always stopped.
+    if (task_->UpdateTreeLeaf()) {
+      for (auto const &part : partitioner_) {
+        part.LeafPartition(ctx_, tree, gpair, p_out_position);
+      }
+    }
+    monitor_->Stop(__func__);
+  }
+
+ public:
+  explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param,
+                                  std::shared_ptr<common::ColumnSampler> column_sampler,
+                                  ObjInfo const *task, common::Monitor *monitor)
+      : monitor_{monitor},
+        param_{param},
+        col_sampler_{std::move(column_sampler)},
+        evaluator_{std::make_unique<HistMultiEvaluator>(ctx, info, param, col_sampler_)},
+        ctx_{ctx},
+        task_{task} {
+    monitor_->Init(__func__);
+  }
+};
+
 class HistBuilder {
  private:
   common::Monitor *monitor_;
@@ -155,8 +364,7 @@ class HistBuilder {
   // initialize temp data structure
   void InitData(DMatrix *fmat, RegTree const *p_tree) {
     monitor_->Start(__func__);
-
-    size_t page_id{0};
+    std::size_t page_id{0};
     bst_bin_t n_total_bins{0};
     partitioner_.clear();
     for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
@@ -195,7 +403,7 @@ class HistBuilder {
                           RegTree *p_tree) {
     CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0));
 
-    size_t page_id = 0;
+    std::size_t page_id = 0;
     auto space = ConstructHistSpace(partitioner_, {node});
     for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
       std::vector<CPUExpandEntry> nodes_to_build{node};
@@ -214,13 +422,13 @@ class HistBuilder {
          * of gradient histogram is equal to snode[nid]
          */
         auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
-        std::vector<uint32_t> const &row_ptr = gmat.cut.Ptrs();
+        std::vector<std::uint32_t> const &row_ptr = gmat.cut.Ptrs();
         CHECK_GE(row_ptr.size(), 2);
-        uint32_t const ibegin = row_ptr[0];
-        uint32_t const iend = row_ptr[1];
+        std::uint32_t const ibegin = row_ptr[0];
+        std::uint32_t const iend = row_ptr[1];
         auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot];
         auto begin = hist.data();
-        for (uint32_t i = ibegin; i < iend; ++i) {
+        for (std::uint32_t i = ibegin; i < iend; ++i) {
           GradientPairPrecise const &et = begin[i];
           grad_stat.Add(et.GetGrad(), et.GetHess());
         }
@@ -259,7 +467,7 @@ class HistBuilder {
     std::vector<CPUExpandEntry> nodes_to_build(valid_candidates.size());
     std::vector<CPUExpandEntry> nodes_to_sub(valid_candidates.size());
 
-    size_t n_idx = 0;
+    std::size_t n_idx = 0;
     for (auto const &c : valid_candidates) {
       auto left_nidx = (*p_tree)[c.nid].LeftChild();
       auto right_nidx = (*p_tree)[c.nid].RightChild();
@@ -275,7 +483,7 @@ class HistBuilder {
       n_idx++;
     }
 
-    size_t page_id{0};
+    std::size_t page_id{0};
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
     for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
       histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
@@ -311,11 +519,12 @@ class HistBuilder {
 
 /*! \brief construct a tree using quantized feature values */
 class QuantileHistMaker : public TreeUpdater {
-  std::unique_ptr<HistBuilder> p_impl_;
+  std::unique_ptr<HistBuilder> p_impl_{nullptr};
+  std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
   std::shared_ptr<common::ColumnSampler> column_sampler_ =
       std::make_shared<common::ColumnSampler>();
   common::Monitor monitor_;
-  ObjInfo const *task_;
+  ObjInfo const *task_{nullptr};
 
  public:
   explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task)
@@ -332,7 +541,10 @@ class QuantileHistMaker : public TreeUpdater {
               const std::vector<RegTree *> &trees) override {
     if (trees.front()->IsMultiTarget()) {
       CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
-      LOG(FATAL) << "Not implemented.";
+      if (!p_mtimpl_) {
+        this->p_mtimpl_ = std::make_unique<MultiTargetHistBuilder>(
+            ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_);
+      }
     } else {
       if (!p_impl_) {
         p_impl_ =
@@ -355,13 +567,14 @@ class QuantileHistMaker : public TreeUpdater {
 
     for (auto tree_it = trees.begin(); tree_it != trees.end(); ++tree_it) {
       if (need_copy()) {
-        // Copy gradient into buffer for sampling.
+        // Copy gradient into buffer for sampling. This converts C-order to F-order.
         std::copy(linalg::cbegin(h_gpair), linalg::cend(h_gpair), linalg::begin(h_sample_out));
       }
       SampleGradient(ctx_, *param, h_sample_out);
       auto *h_out_position = &out_position[tree_it - trees.begin()];
       if ((*tree_it)->IsMultiTarget()) {
-        LOG(FATAL) << "Not implemented.";
+        UpdateTree<MultiExpandEntry>(&monitor_, h_sample_out, p_mtimpl_.get(), p_fmat, param,
+                                     h_out_position, *tree_it);
       } else {
         UpdateTree<CPUExpandEntry>(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param,
                                    h_out_position, *tree_it);
@@ -372,6 +585,9 @@ class QuantileHistMaker : public TreeUpdater {
   bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
     if (p_impl_) {
       return p_impl_->UpdatePredictionCache(data, out_preds);
+    } else if (p_mtimpl_) {
+      // Not yet supported.
+      return false;
     } else {
       return false;
     }
@@ -383,6 +599,6 @@ class QuantileHistMaker : public TreeUpdater {
 XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker")
     .describe("Grow tree using quantized histogram.")
     .set_body([](Context const *ctx, ObjInfo const *task) {
-      return new QuantileHistMaker(ctx, task);
+      return new QuantileHistMaker{ctx, task};
     });
 }  // namespace xgboost::tree
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 8d601f355..b7864bb50 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -3,7 +3,7 @@ import os
 import subprocess
 import sys
 from multiprocessing import Pool, cpu_count
-from typing import Dict, Optional, Tuple
+from typing import Dict, Tuple
 
 from pylint import epylint
 from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
@@ -15,8 +15,11 @@ SRCPATH = os.path.normpath(
 
 
 @record_time
-def run_black(rel_path: str) -> bool:
-    cmd = ["black", "-q", "--check", rel_path]
+def run_black(rel_path: str, fix: bool) -> bool:
+    if fix:
+        cmd = ["black", "-q", rel_path]
+    else:
+        cmd = ["black", "-q", "--check", rel_path]
     ret = subprocess.run(cmd).returncode
     if ret != 0:
         subprocess.run(["black", "--version"])
@@ -31,8 +34,11 @@ Please run the following command on your machine to address the formatting error
 
 
 @record_time
-def run_isort(rel_path: str) -> bool:
-    cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
+def run_isort(rel_path: str, fix: bool) -> bool:
+    if fix:
+        cmd = ["isort", f"--src={SRCPATH}", "--profile=black", rel_path]
+    else:
+        cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
     ret = subprocess.run(cmd).returncode
     if ret != 0:
         subprocess.run(["isort", "--version"])
@@ -132,7 +138,7 @@ def run_pylint() -> bool:
 def main(args: argparse.Namespace) -> None:
     if args.format == 1:
         black_results = [
-            run_black(path)
+            run_black(path, args.fix)
             for path in [
                 # core
                 "python-package/",
@@ -166,7 +172,7 @@ def main(args: argparse.Namespace) -> None:
             sys.exit(-1)
 
         isort_results = [
-            run_isort(path)
+            run_isort(path, args.fix)
             for path in [
                 # core
                 "python-package/",
@@ -230,6 +236,11 @@ if __name__ == "__main__":
     parser.add_argument("--format", type=int, choices=[0, 1], default=1)
     parser.add_argument("--type-check", type=int, choices=[0, 1], default=1)
     parser.add_argument("--pylint", type=int, choices=[0, 1], default=1)
+    parser.add_argument(
+        "--fix",
+        action="store_true",
+        help="Fix the formatting issues instead of emitting an error.",
+    )
     args = parser.parse_args()
     try:
         main(args)
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index c96b98497..270eacf21 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -412,7 +412,7 @@ std::pair<Json, Json> TestModelSlice(std::string booster) {
     j++;
   }
 
-  // CHECK sliced model doesn't have dependency on old one
+  // CHECK sliced model doesn't have dependency on the old one
   learner.reset();
   CHECK_EQ(sliced->GetNumFeature(), kCols);
 
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index a059f0436..c83544413 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -473,7 +473,7 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint
                                 int32_t device = Context::kCpuId) {
   size_t shape[1]{1};
   LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
-                           n_groups, 1, MultiStrategy::kComposite);
+                           n_groups, 1, MultiStrategy::kOneOutputPerTree);
   return mparam;
 }
 
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 4570a010d..d6cf33445 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -428,7 +428,7 @@ void TestVectorLeafPrediction(Context const *ctx) {
 
   LearnerModelParam mparam{static_cast<bst_feature_t>(kCols),
                            linalg::Vector<float>{{0.5}, {1}, Context::kCpuId}, 1, 3,
-                           MultiStrategy::kMonolithic};
+                           MultiStrategy::kMultiOutputTree};
 
   std::vector<std::unique_ptr<RegTree>> trees;
   trees.emplace_back(new RegTree{mparam.LeafLength(), mparam.num_feature});
diff --git a/tests/cpp/test_multi_target.cc b/tests/cpp/test_multi_target.cc
index d2e34235c..c8d371941 100644
--- a/tests/cpp/test_multi_target.cc
+++ b/tests/cpp/test_multi_target.cc
@@ -124,11 +124,11 @@ TEST(MultiStrategy, Configure) {
   auto p_fmat = RandomDataGenerator{12ul, 3ul, 0.0}.GenerateDMatrix();
   p_fmat->Info().labels.Reshape(p_fmat->Info().num_row_, 2);
   std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-  learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "2"}});
+  learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "2"}});
   learner->Configure();
   ASSERT_EQ(learner->Groups(), 2);
 
-  learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "0"}});
+  learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "0"}});
   ASSERT_THROW({ learner->Configure(); }, dmlc::Error);
 }
 }  // namespace xgboost
diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py
index b8be5dda1..50bbc3f1c 100644
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -116,7 +116,7 @@ def test_with_mq2008(objective, metric) -> None:
         x_valid,
         y_valid,
         qid_valid,
-    ) = tm.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))
+    ) = tm.data.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))
 
     if metric.find("map") != -1 or objective.find("map") != -1:
         y_train[y_train <= 1] = 0.0
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 6b28296b2..ea8d5dcb5 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -32,6 +32,19 @@ def train_result(param, dmat: xgb.DMatrix, num_rounds: int) -> dict:
     return result
 
 
+class TestGPUUpdatersMulti:
+    @given(
+        hist_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
+    )
+    @settings(deadline=None, max_examples=50, print_blob=True)
+    def test_hist(self, param, num_rounds, dataset):
+        param["tree_method"] = "gpu_hist"
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result["train"][dataset.metric])
+
+
 class TestGPUUpdaters:
     cputest = test_up.TestTreeMethod()
 
@@ -101,7 +114,7 @@ class TestGPUUpdaters:
     ) -> None:
         cat_parameters.update(hist_parameters)
         dataset = tm.TestDataset(
-            "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
+            "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse"
         )
         cat_parameters["tree_method"] = "gpu_hist"
         results = train_result(cat_parameters, dataset.get_dmat(), 16)
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index acacc55f8..d03ce142b 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -15,13 +15,17 @@ rng = np.random.RandomState(1994)
 
 
 def json_model(model_path: str, parameters: dict) -> dict:
-    X = np.random.random((10, 3))
-    y = np.random.randint(2, size=(10,))
+    datasets = pytest.importorskip("sklearn.datasets")
+
+    X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)
+    if parameters.get("objective", None) == "multi:softmax":
+        parameters["num_class"] = 3
 
     dm1 = xgb.DMatrix(X, y)
 
     bst = xgb.train(parameters, dm1)
     bst.save_model(model_path)
+
     if model_path.endswith("ubj"):
         import ubjson
         with open(model_path, "rb") as ubjfd:
@@ -326,24 +330,43 @@ class TestModels:
         from_ubjraw = xgb.Booster()
         from_ubjraw.load_model(ubj_raw)
 
-        old_from_json = from_jraw.save_raw(raw_format="deprecated")
-        old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
+        if parameters.get("multi_strategy", None) != "multi_output_tree":
+            # old binary model is not supported.
+            old_from_json = from_jraw.save_raw(raw_format="deprecated")
+            old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
 
-        assert old_from_json == old_from_ubj
+            assert old_from_json == old_from_ubj
 
         raw_json = bst.save_raw(raw_format="json")
         pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n"
         bst.load_model(bytearray(pretty, encoding="ascii"))
 
-        old_from_json = from_jraw.save_raw(raw_format="deprecated")
-        old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
+        if parameters.get("multi_strategy", None) != "multi_output_tree":
+            # old binary model is not supported.
+            old_from_json = from_jraw.save_raw(raw_format="deprecated")
+            old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
 
-        assert old_from_json == old_from_ubj
+            assert old_from_json == old_from_ubj
+
+        rng = np.random.default_rng()
+        X = rng.random(size=from_jraw.num_features() * 10).reshape(
+            (10, from_jraw.num_features())
+        )
+        predt_from_jraw = from_jraw.predict(xgb.DMatrix(X))
+        predt_from_bst = bst.predict(xgb.DMatrix(X))
+        np.testing.assert_allclose(predt_from_jraw, predt_from_bst)
 
     @pytest.mark.parametrize("ext", ["json", "ubj"])
     def test_model_json_io(self, ext: str) -> None:
         parameters = {"booster": "gbtree", "tree_method": "hist"}
         self.run_model_json_io(parameters, ext)
+        parameters = {
+            "booster": "gbtree",
+            "tree_method": "hist",
+            "multi_strategy": "multi_output_tree",
+            "objective": "multi:softmax",
+        }
+        self.run_model_json_io(parameters, ext)
         parameters = {"booster": "gblinear"}
         self.run_model_json_io(parameters, ext)
         parameters = {"booster": "dart", "tree_method": "hist"}
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index fabf8672e..e8375aa5e 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -465,7 +465,7 @@ class TestCallbacks:
                 assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl"))
 
     def test_callback_list(self):
-        X, y = tm.get_california_housing()
+        X, y = tm.data.get_california_housing()
         m = xgb.DMatrix(X, y)
         callbacks = [xgb.callback.EarlyStopping(rounds=10)]
         for i in range(4):
diff --git a/tests/python/test_ranking.py b/tests/python/test_ranking.py
index 239271ec7..30de920f7 100644
--- a/tests/python/test_ranking.py
+++ b/tests/python/test_ranking.py
@@ -82,7 +82,7 @@ class TestRanking:
         """
         cls.dpath = 'demo/rank/'
         (x_train, y_train, qid_train, x_test, y_test, qid_test,
-         x_valid, y_valid, qid_valid) = tm.get_mq2008(cls.dpath)
+         x_valid, y_valid, qid_valid) = tm.data.get_mq2008(cls.dpath)
 
         # instantiate the matrices
         cls.dtrain = xgboost.DMatrix(x_train, y_train)
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index be72793e7..dd710f6a4 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -11,6 +11,7 @@ from xgboost import testing as tm
 from xgboost.testing.params import (
     cat_parameter_strategy,
     exact_parameter_strategy,
+    hist_multi_parameter_strategy,
     hist_parameter_strategy,
 )
 from xgboost.testing.updater import check_init_estimation, check_quantile_loss
@@ -18,11 +19,70 @@ from xgboost.testing.updater import check_init_estimation, check_quantile_loss
 
 def train_result(param, dmat, num_rounds):
     result = {}
-    xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
-              evals_result=result)
+    booster = xgb.train(
+        param,
+        dmat,
+        num_rounds,
+        [(dmat, "train")],
+        verbose_eval=False,
+        evals_result=result,
+    )
+    assert booster.num_features() == dmat.num_col()
+    assert booster.num_boosted_rounds() == num_rounds
+    assert booster.feature_names == dmat.feature_names
+    assert booster.feature_types == dmat.feature_types
+
     return result
 
 
+class TestTreeMethodMulti:
+    @given(
+        exact_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
+    )
+    @settings(deadline=None, print_blob=True)
+    def test_exact(self, param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
+        if dataset.name.endswith("-l1"):
+            return
+        param["tree_method"] = "exact"
+        param = dataset.set_params(param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        assert tm.non_increasing(result["train"][dataset.metric])
+
+    @given(
+        exact_parameter_strategy,
+        hist_parameter_strategy,
+        strategies.integers(1, 20),
+        tm.multi_dataset_strategy,
+    )
+    @settings(deadline=None, print_blob=True)
+    def test_approx(self, param, hist_param, num_rounds, dataset):
+        param["tree_method"] = "approx"
+        param = dataset.set_params(param)
+        param.update(hist_param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result["train"][dataset.metric])
+
+    @given(
+        exact_parameter_strategy,
+        hist_multi_parameter_strategy,
+        strategies.integers(1, 20),
+        tm.multi_dataset_strategy,
+    )
+    @settings(deadline=None, print_blob=True)
+    def test_hist(
+        self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset
+    ) -> None:
+        if dataset.name.endswith("-l1"):
+            return
+        param["tree_method"] = "hist"
+        param = dataset.set_params(param)
+        param.update(hist_param)
+        result = train_result(param, dataset.get_dmat(), num_rounds)
+        note(result)
+        assert tm.non_increasing(result["train"][dataset.metric])
+
+
 class TestTreeMethod:
     USE_ONEHOT = np.iinfo(np.int32).max
     USE_PART = 1
@@ -77,10 +137,14 @@ class TestTreeMethod:
         # Second prune should not change the tree
         assert after_prune == second_prune
 
-    @given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
-           tm.dataset_strategy)
+    @given(
+        exact_parameter_strategy,
+        hist_parameter_strategy,
+        strategies.integers(1, 20),
+        tm.dataset_strategy
+    )
     @settings(deadline=None, print_blob=True)
-    def test_hist(self, param, hist_param, num_rounds, dataset):
+    def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
         param['tree_method'] = 'hist'
         param = dataset.set_params(param)
         param.update(hist_param)
@@ -88,23 +152,6 @@ class TestTreeMethod:
         note(result)
         assert tm.non_increasing(result['train'][dataset.metric])
 
-    @given(tm.sparse_datasets_strategy)
-    @settings(deadline=None, print_blob=True)
-    def test_sparse(self, dataset):
-        param = {"tree_method": "hist", "max_bin": 64}
-        hist_result = train_result(param, dataset.get_dmat(), 16)
-        note(hist_result)
-        assert tm.non_increasing(hist_result['train'][dataset.metric])
-
-        param = {"tree_method": "approx", "max_bin": 64}
-        approx_result = train_result(param, dataset.get_dmat(), 16)
-        note(approx_result)
-        assert tm.non_increasing(approx_result['train'][dataset.metric])
-
-        np.testing.assert_allclose(
-            hist_result["train"]["rmse"], approx_result["train"]["rmse"]
-        )
-
     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
         dpath = 'demo/data/'
@@ -143,6 +190,23 @@ class TestTreeMethod:
         w = [0, 0, 1, 0]
         model.fit(X, y, sample_weight=w)
 
+    @given(tm.sparse_datasets_strategy)
+    @settings(deadline=None, print_blob=True)
+    def test_sparse(self, dataset):
+        param = {"tree_method": "hist", "max_bin": 64}
+        hist_result = train_result(param, dataset.get_dmat(), 16)
+        note(hist_result)
+        assert tm.non_increasing(hist_result['train'][dataset.metric])
+
+        param = {"tree_method": "approx", "max_bin": 64}
+        approx_result = train_result(param, dataset.get_dmat(), 16)
+        note(approx_result)
+        assert tm.non_increasing(approx_result['train'][dataset.metric])
+
+        np.testing.assert_allclose(
+            hist_result["train"]["rmse"], approx_result["train"]["rmse"]
+        )
+
     def run_invalid_category(self, tree_method: str) -> None:
         rng = np.random.default_rng()
         # too large
@@ -365,7 +429,7 @@ class TestTreeMethod:
     ) -> None:
         cat_parameters.update(hist_parameters)
         dataset = tm.TestDataset(
-            "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
+            "ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse"
         )
         cat_parameters["tree_method"] = tree_method
         results = train_result(cat_parameters, dataset.get_dmat(), 16)
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index 369dcd421..0bf952025 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -1168,7 +1168,7 @@ def test_dask_aft_survival() -> None:
 
 def test_dask_ranking(client: "Client") -> None:
     dpath = "demo/rank/"
-    mq2008 = tm.get_mq2008(dpath)
+    mq2008 = tm.data.get_mq2008(dpath)
     data = []
     for d in mq2008:
         if isinstance(d, scipy.sparse.csr_matrix):

From 15a2724ff70357dee395996fb046dac2aaab20d0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 23 Mar 2023 01:31:46 +0800
Subject: [PATCH 29/32] Removed outdated configuration serialization logic.
 (#8942)

- `saved_params` is empty.
- `saved_configs_` contains `num_round`, which is not used anywhere inside xgboost.
---
 src/learner.cc | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/src/learner.cc b/src/learner.cc
index 9b1d65ce6..50d54c9fc 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -882,7 +882,6 @@ std::string const LearnerConfiguration::kEvalMetric {"eval_metric"};  // NOLINT
 
 class LearnerIO : public LearnerConfiguration {
  private:
-  std::set<std::string> saved_configs_ = {"num_round"};
   // Used to identify the offset of JSON string when
   // Will be removed once JSON takes over.  Right now we still loads some RDS files from R.
   std::string const serialisation_header_ { u8"CONFIG-offset:" };
@@ -1035,21 +1034,11 @@ class LearnerIO : public LearnerConfiguration {
     CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format";
 
     obj_.reset(ObjFunction::Create(tparam_.objective, &ctx_));
-    gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_,
-                                       &learner_model_param_));
+    gbm_.reset(GradientBooster::Create(tparam_.booster, &ctx_, &learner_model_param_));
     gbm_->Load(fi);
     if (mparam_.contain_extra_attrs != 0) {
       std::vector<std::pair<std::string, std::string> > attr;
       fi->Read(&attr);
-      for (auto& kv : attr) {
-        const std::string prefix = "SAVED_PARAM_";
-        if (kv.first.find(prefix) == 0) {
-          const std::string saved_param = kv.first.substr(prefix.length());
-          if (saved_configs_.find(saved_param) != saved_configs_.end()) {
-            cfg_[saved_param] = kv.second;
-          }
-        }
-      }
       attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
     }
     bool warn_old_model { false };
@@ -1132,16 +1121,6 @@ class LearnerIO : public LearnerConfiguration {
     std::vector<std::pair<std::string, std::string> > extra_attr;
     mparam.contain_extra_attrs = 1;
 
-    {
-      std::vector<std::string> saved_params;
-      for (const auto& key : saved_params) {
-        auto it = cfg_.find(key);
-        if (it != cfg_.end()) {
-          mparam.contain_extra_attrs = 1;
-          extra_attr.emplace_back("SAVED_PARAM_" + key, it->second);
-        }
-      }
-    }
     {
       // Similar to JSON model IO, we save the objective.
       Json j_obj { Object() };

From bf88dadb61ca87f6287d20618807ee7580239db5 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 23 Mar 2023 03:27:04 +0800
Subject: [PATCH 30/32] [doc] Fix callback example. (#8944)

---
 python-package/xgboost/callback.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index 5be6a058a..6569f7e3d 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -324,7 +324,7 @@ class EarlyStopping(TrainingCallback):
 
             es = xgboost.callback.EarlyStopping(
                 rounds=2,
-                abs_tol=1e-3,
+                min_delta=1e-3,
                 save_best=True,
                 maximize=False,
                 data_name="validation_0",

From 21a52c7f983f0048968b51fd91381ddd584def07 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 23 Mar 2023 13:30:42 +0800
Subject: [PATCH 31/32] [doc] Add introduction and notes for the sklearn
 interface. (#8948)

---
 demo/guide-python/sklearn_examples.py |   3 +
 doc/python/index.rst                  |   1 +
 doc/python/python_api.rst             |   1 +
 doc/python/python_intro.rst           |   3 +-
 doc/python/sklearn_estimator.rst      | 162 ++++++++++++++++++++++++++
 python-package/xgboost/sklearn.py     |  50 ++++----
 6 files changed, 199 insertions(+), 21 deletions(-)
 create mode 100644 doc/python/sklearn_estimator.rst

diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
index 5890987f9..cf33e959a 100644
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -2,6 +2,9 @@
 Collection of examples for using sklearn interface
 ==================================================
 
+For an introduction to XGBoost's scikit-learn estimator interface, see
+:doc:`/python/sklearn_estimator`.
+
 Created on 1 Apr 2015
 
 @author: Jamie Hall
diff --git a/doc/python/index.rst b/doc/python/index.rst
index 60608700b..fd34e0d43 100644
--- a/doc/python/index.rst
+++ b/doc/python/index.rst
@@ -10,6 +10,7 @@ Contents
 
 .. toctree::
   python_intro
+  sklearn_estimator
   python_api
   callbacks
   model
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
index b27542a8b..0cbf63456 100644
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -41,6 +41,7 @@ Learning API
 
 Scikit-Learn API
 ----------------
+
 .. automodule:: xgboost.sklearn
 .. autoclass:: xgboost.XGBRegressor
     :members:
diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
index c36db91ff..505556383 100644
--- a/doc/python/python_intro.rst
+++ b/doc/python/python_intro.rst
@@ -305,7 +305,8 @@ Scikit-Learn interface
 ----------------------
 
 XGBoost provides an easy to use scikit-learn interface for some pre-defined models
-including regression, classification and ranking.
+including regression, classification and ranking. See :doc:`/python/sklearn_estimator`
+for more info.
 
 .. code-block:: python
 
diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst
new file mode 100644
index 000000000..9748dbebd
--- /dev/null
+++ b/doc/python/sklearn_estimator.rst
@@ -0,0 +1,162 @@
+##########################################
+Using the Scikit-Learn Estimator Interface
+##########################################
+
+**Contents**
+
+.. contents::
+  :backlinks: none
+  :local:
+
+********
+Overview
+********
+
+In addition to the native interface, XGBoost features a sklearn estimator interface that
+conforms to `sklearn estimator guideline
+<https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator>`__. It
+supports regression, classification, and learning to rank. Survival training for the
+sklearn estimator interface is still a work in progress.
+
+You can find some quick start examples at
+:ref:`sphx_glr_python_examples_sklearn_examples.py`. The main advantage of using sklearn
+interface is that it works with most of the utilities provided by sklearn like
+:py:func:`sklearn.model_selection.cross_validate`. Also, many other libraries recognize
+the sklearn estimator interface thanks to its popularity.
+
+With the sklearn estimator interface, we can train a classification model with only a
+couple lines of Python code. Here's an example for training a classification model:
+
+.. code-block:: python
+
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.model_selection import train_test_split
+
+    import xgboost as xgb
+
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=94)
+
+    # Use "hist" for constructing the trees, with early stopping enabled.
+    clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
+    # Fit the model, test sets are used for early stopping.
+    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+    # Save model into JSON format.
+    clf.save_model("clf.json")
+
+
+The ``tree_method`` parameter specifies the method to use for constructing the trees, and
+the ``early_stopping_rounds`` parameter enables early stopping. Early stopping can help
+prevent overfitting and save time during training.
+
+**************
+Early Stopping
+**************
+
+As demonstrated in the previous example, early stopping can be enabled by the parameter
+``early_stopping_rounds``. Alternatively, the callback
+:py:class:`xgboost.callback.EarlyStopping` can be used to specify more details about the behavior of
+early stopping, including whether XGBoost should return the best model instead of the full
+stack of trees:
+
+.. code-block:: python
+
+    early_stop = xgb.callback.EarlyStopping(
+        rounds=2, metric_name='logloss', data_name='Validation_0', save_best=True
+    )
+    clf = xgb.XGBClassifier(tree_method="hist", callbacks=[early_stop])
+    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+
+At present, XGBoost doesn't implement data splitting logic within the estimator and relies
+on the ``eval_set`` parameter of the :py:meth:`xgboost.XGBModel.fit` method. If you want
+to use early stopping to prevent overfitting, you'll need to manually split your data into
+training and testing sets using the :py:func:`sklearn.model_selection.train_test_split`
+function from the `sklearn` library. Some other machine learning algorithms, like those in
+`sklearn`, include early stopping as part of the estimator and may work with cross
+validation. However, using early stopping during cross validation may not be a perfect
+approach because it changes the model's number of trees for each validation fold, leading
+to a different model. A better approach is to retrain the model after cross validation using
+the best hyperparameters along with early stopping. If you want to experiment with the idea of
+using cross validation with early stopping, here is a snippet to begin with:
+
+.. code-block:: python
+
+    from sklearn.base import clone
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.model_selection import StratifiedKFold, cross_validate
+
+    import xgboost as xgb
+
+    X, y = load_breast_cancer(return_X_y=True)
+
+
+    def fit_and_score(estimator, X_train, X_test, y_train, y_test):
+        """Fit the estimator on the train set and score it on both sets"""
+        estimator.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+
+        train_score = estimator.score(X_train, y_train)
+        test_score = estimator.score(X_test, y_test)
+
+        return estimator, train_score, test_score
+
+
+    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=94)
+
+    clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
+
+    results = {}
+
+    for train, test in cv.split(X, y):
+        X_train = X[train]
+        X_test = X[test]
+        y_train = y[train]
+        y_test = y[test]
+        est, train_score, test_score = fit_and_score(
+            clone(clf), X_train, X_test, y_train, y_test
+        )
+        results[est] = (train_score, test_score)
+
+
+***********************************
+Obtaining the native booster object
+***********************************
+
+The sklearn estimator interface primarily facilitates training and doesn't implement all
+features available in XGBoost. For instance, in order to have cached predictions,
+:py:class:`xgboost.DMatrix` needs to be used with :py:meth:`xgboost.Booster.predict`. One
+can obtain the booster object from the sklearn interface using
+:py:meth:`xgboost.XGBModel.get_booster`:
+
+.. code-block:: python
+
+   booster = clf.get_booster()
+   print(booster.num_boosted_rounds())
+
+
+**********
+Prediction
+**********
+
+When early stopping is enabled, prediction functions including the
+:py:meth:`xgboost.XGBModel.predict`, :py:meth:`xgboost.XGBModel.score`, and
+:py:meth:`xgboost.XGBModel.apply` methods will use the best model automatically. That is,
+the :py:attr:`xgboost.XGBModel.best_iteration` is used to specify the range of trees used
+in prediction.
+
+To have cached results for incremental prediction, please use the
+:py:meth:`xgboost.Booster.predict` method instead.
+
+
+**************************
+Number of parallel threads
+**************************
+
+When working with XGBoost and other sklearn tools, you can specify how many threads you
+want to use by using the ``n_jobs`` parameter. By default, XGBoost uses all the available
+threads on your computer, which can lead to some interesting consequences when combined
+with other sklearn functions like :py:func:`sklearn.model_selection.cross_validate`. If
+both XGBoost and sklearn are set to use all threads, your computer may start to slow down
+significantly due to something called "thread thrashing". To avoid this, you can simply
+set the ``n_jobs`` parameter for XGBoost to `None` (which uses all threads) and the
+``n_jobs`` parameter for sklearn to `1`. This way, both programs will be able to work
+together smoothly without causing any unnecessary computer strain.
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 805eb75b3..52175981a 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -368,18 +368,21 @@ __model_doc = f"""
 
         .. versionadded:: 1.6.0
 
-        Activates early stopping. Validation metric needs to improve at least once in
-        every **early_stopping_rounds** round(s) to continue training.  Requires at least
-        one item in **eval_set** in :py:meth:`fit`.
+        - Activates early stopping. Validation metric needs to improve at least once in
+          every **early_stopping_rounds** round(s) to continue training.  Requires at
+          least one item in **eval_set** in :py:meth:`fit`.
 
-        The method returns the model from the last iteration (not the best one).  If
-        there's more than one item in **eval_set**, the last entry will be used for early
-        stopping.  If there's more than one metric in **eval_metric**, the last metric
-        will be used for early stopping.
+        - The method returns the model from the last iteration, not the best one. Use a
+          callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
+          model is preferred.
 
-        If early stopping occurs, the model will have three additional fields:
-        :py:attr:`best_score`, :py:attr:`best_iteration` and
-        :py:attr:`best_ntree_limit`.
+        - If there's more than one item in **eval_set**, the last entry will be used for
+          early stopping.  If there's more than one metric in **eval_metric**, the last
+          metric will be used for early stopping.
+
+        - If early stopping occurs, the model will have three additional fields:
+          :py:attr:`best_score`, :py:attr:`best_iteration` and
+          :py:attr:`best_ntree_limit`.
 
         .. note::
 
@@ -479,7 +482,9 @@ Parameters
         doc.extend([get_doc(i) for i in items])
         if end_note:
             doc.append(end_note)
-        full_doc = [header + "\n\n"]
+        full_doc = [
+            header + "\nSee :doc:`/python/sklearn_estimator` for more information.\n"
+        ]
         full_doc.extend(doc)
         cls.__doc__ = "".join(full_doc)
         return cls
@@ -1146,10 +1151,10 @@ class XGBModel(XGBModelBase):
         base_margin: Optional[ArrayLike] = None,
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> ArrayLike:
-        """Predict with `X`.  If the model is trained with early stopping, then `best_iteration`
-        is used automatically.  For tree models, when data is on GPU, like cupy array or
-        cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
-        automatically, otherwise it will run on CPU.
+        """Predict with `X`.  If the model is trained with early stopping, then
+        :py:attr:`best_iteration` is used automatically.  For tree models, when data is
+        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
+        prediction is run on GPU automatically, otherwise it will run on CPU.
 
         .. note:: This function is only thread safe for `gbtree` and `dart`.
 
@@ -1224,8 +1229,8 @@ class XGBModel(XGBModelBase):
         ntree_limit: int = 0,
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> np.ndarray:
-        """Return the predicted leaf every tree for each sample. If the model is trained with
-        early stopping, then `best_iteration` is used automatically.
+        """Return the predicted leaf every tree for each sample. If the model is trained
+        with early stopping, then :py:attr:`best_iteration` is used automatically.
 
         Parameters
         ----------
@@ -1635,7 +1640,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         base_margin: Optional[ArrayLike] = None,
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> np.ndarray:
-        """Predict the probability of each `X` example being of a given class.
+        """Predict the probability of each `X` example being of a given class. If the
+        model is trained with early stopping, then :py:attr:`best_iteration` is used
+        automatically.
 
         .. note:: This function is only thread safe for `gbtree` and `dart`.
 
@@ -1661,6 +1668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         prediction :
             a numpy array of shape array-like of shape (n_samples, n_classes) with the
             probability of each data example being of a given class.
+
         """
         # custom obj:      Do nothing as we don't know what to do.
         # softprob:        Do nothing, output is proba.
@@ -2122,11 +2130,13 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         return super().apply(X, ntree_limit, iteration_range)
 
     def score(self, X: ArrayLike, y: ArrayLike) -> float:
-        """Evaluate score for data using the last evaluation metric.
+        """Evaluate score for data using the last evaluation metric. If the model is
+        trained with early stopping, then :py:attr:`best_iteration` is used
+        automatically.
 
         Parameters
         ----------
-        X : pd.DataFrame|cudf.DataFrame
+        X : Union[pd.DataFrame, cudf.DataFrame]
           Feature matrix. A DataFrame with a special `qid` column.
 
         y :

From cff50fe3efc92edb691a4ce6e710c6909525b23d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 23 Mar 2023 16:12:04 +0800
Subject: [PATCH 32/32] Bump hadoop.version from 3.3.4 to 3.3.5 in
 /jvm-packages (#8962)

Bumps `hadoop.version` from 3.3.4 to 3.3.5.

Updates `hadoop-hdfs` from 3.3.4 to 3.3.5

Updates `hadoop-common` from 3.3.4 to 3.3.5

---
updated-dependencies:
- dependency-name: org.apache.hadoop:hadoop-hdfs
  dependency-type: direct:production
  update-type: version-update:semver-patch
- dependency-name: org.apache.hadoop:hadoop-common
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml                 | 2 +-
 jvm-packages/xgboost4j-flink/pom.xml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index e662b762a..a5d219040 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -37,7 +37,7 @@
         <spark.version>3.1.1</spark.version>
         <scala.version>2.12.8</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
-        <hadoop.version>3.3.4</hadoop.version>
+        <hadoop.version>3.3.5</hadoop.version>
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index e48feb876..b8b757eae 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -51,7 +51,7 @@
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-common</artifactId>
-            <version>3.3.4</version>
+            <version>3.3.5</version>
         </dependency>
     </dependencies>