From ba9d24ff7ba70632403ebce3de931c769cd93c46 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Mon, 17 Apr 2023 12:48:23 -0700
Subject: [PATCH 01/34] Make sure metrics work with column-wise distributed
 training (#9020)

---
 src/metric/auc.cc                           |  14 +-
 src/metric/elementwise_metric.cu            |  14 +-
 src/metric/multiclass_metric.cu             |   4 +-
 src/metric/rank_metric.cc                   |  16 +-
 src/metric/survival_metric.cu               |   4 +-
 tests/cpp/helpers.cc                        |   9 +-
 tests/cpp/helpers.h                         |  31 +-
 tests/cpp/metric/test_auc.cc                | 271 ++------------
 tests/cpp/metric/test_auc.h                 | 249 +++++++++++++
 tests/cpp/metric/test_elementwise_metric.cc | 391 ++++----------------
 tests/cpp/metric/test_elementwise_metric.h  | 385 +++++++++++++++++++
 tests/cpp/metric/test_multiclass_metric.cc  |  98 +----
 tests/cpp/metric/test_multiclass_metric.h   |  91 +++++
 tests/cpp/metric/test_rank_metric.cc        | 190 ++--------
 tests/cpp/metric/test_rank_metric.h         | 191 ++++++++++
 tests/cpp/metric/test_survival_metric.cu    |  36 +-
 16 files changed, 1183 insertions(+), 811 deletions(-)
 create mode 100644 tests/cpp/metric/test_auc.h
 create mode 100644 tests/cpp/metric/test_elementwise_metric.h
 create mode 100644 tests/cpp/metric/test_multiclass_metric.h
 create mode 100644 tests/cpp/metric/test_rank_metric.h
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index a926c2c5b..2d4becfa8 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -116,8 +116,10 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
 
   // we have 2 averages going in here, first is among workers, second is among
   // classes. allreduce sums up fp/tp auc for each class.
-  collective::Allreduce<collective::Operation::kSum>(results.Values().data(),
-                                                     results.Values().size());
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kSum>(results.Values().data(),
+                                                       results.Values().size());
+  }
   double auc_sum{0};
   double tp_sum{0};
   for (size_t c = 0; c < n_classes; ++c) {
@@ -290,7 +292,9 @@ class EvalAUC : public MetricNoCache {
       }
 
       std::array<double, 2> results{auc, static_cast<double>(valid_groups)};
-      collective::Allreduce<collective::Operation::kSum>(results.data(), results.size());
+      if (info.IsRowSplit()) {
+        collective::Allreduce<collective::Operation::kSum>(results.data(), results.size());
+      }
       auc = results[0];
       valid_groups = static_cast<uint32_t>(results[1]);
 
@@ -319,7 +323,9 @@ class EvalAUC : public MetricNoCache {
       }
       double local_area = fp * tp;
       std::array<double, 2> result{auc, local_area};
-      collective::Allreduce<collective::Operation::kSum>(result.data(), result.size());
+      if (info.IsRowSplit()) {
+        collective::Allreduce<collective::Operation::kSum>(result.data(), result.size());
+      }
       std::tie(auc, local_area) = common::UnpackArr(std::move(result));
       if (local_area <= 0) {
         // the dataset across all workers have only positive or negative sample
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index 9006bdfca..b1c764047 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -198,7 +198,7 @@ class PseudoErrorLoss : public MetricNoCache {
           return std::make_tuple(v, wt);
         });
     double dat[2]{result.Residue(), result.Weights()};
-    if (collective::IsDistributed()) {
+    if (info.IsRowSplit()) {
       collective::Allreduce<collective::Operation::kSum>(dat, 2);
     }
     return EvalRowMAPE::GetFinal(dat[0], dat[1]);
@@ -367,7 +367,9 @@ struct EvalEWiseBase : public MetricNoCache {
         });
 
     double dat[2]{result.Residue(), result.Weights()};
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    if (info.IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    }
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -439,7 +441,9 @@ class QuantileError : public MetricNoCache {
     if (info.num_row_ == 0) {
       // empty DMatrix on distributed env
       double dat[2]{0.0, 0.0};
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
+      if (info.IsRowSplit()) {
+        collective::Allreduce<collective::Operation::kSum>(dat, 2);
+      }
       CHECK_GT(dat[1], 0);
       return dat[0] / dat[1];
     }
@@ -477,7 +481,9 @@ class QuantileError : public MetricNoCache {
           return std::make_tuple(l, w);
         });
     double dat[2]{result.Residue(), result.Weights()};
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    if (info.IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    }
     CHECK_GT(dat[1], 0);
     return dat[0] / dat[1];
   }
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index aed6e7f4b..a1d19dbc8 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -181,7 +181,9 @@ struct EvalMClassBase : public MetricNoCache {
       dat[0] = result.Residue();
       dat[1] = result.Weights();
     }
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    if (info.IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    }
     return Derived::GetFinal(dat[0], dat[1]);
   }
   /*!
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index a84d0edb1..62efd0876 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -244,7 +244,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
       exc.Rethrow();
     }
 
-    if (collective::IsDistributed()) {
+    if (collective::IsDistributed() && info.IsRowSplit()) {
       double dat[2]{sum_metric, static_cast<double>(ngroups)};
       // approximately estimate the metric using mean
       collective::Allreduce<collective::Operation::kSum>(dat, 2);
@@ -401,9 +401,14 @@ class EvalRankWithCache : public Metric {
 };
 
 namespace {
-double Finalize(double score, double sw) {
+double Finalize(MetaInfo const& info, double score, double sw) {
   std::array<double, 2> dat{score, sw};
-  collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
+    // Continue with the globally reduced sums rather than the worker-local inputs.
+    score = dat[0];
+    sw = dat[1];
+  }
   if (sw > 0.0) {
     score = score / sw;
   }
@@ -430,7 +432,7 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
               std::shared_ptr<ltr::NDCGCache> p_cache) override {
     if (ctx_->IsCUDA()) {
       auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
-      return Finalize(ndcg.Residue(), ndcg.Weights());
+      return Finalize(info, ndcg.Residue(), ndcg.Weights());
     }
 
     // group local ndcg
@@ -476,7 +478,7 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
       sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
     }
     auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
-    return Finalize(ndcg, sum_w);
+    return Finalize(info, ndcg, sum_w);
   }
 };
 
@@ -489,7 +491,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
               std::shared_ptr<ltr::MAPCache> p_cache) override {
     if (ctx_->IsCUDA()) {
       auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
-      return Finalize(map.Residue(), map.Weights());
+      return Finalize(info, map.Residue(), map.Weights());
     }
 
     auto gptr = p_cache->DataGroupPtr(ctx_);
@@ -532,7 +534,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
       sw += weight[i];
     }
     auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
-    return Finalize(sum, sw);
+    return Finalize(info, sum, sw);
   }
 };
 
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index 8205f07a1..9b1773dc5 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -212,7 +212,9 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
                                   info.labels_upper_bound_, preds);
 
     double dat[2]{result.Residue(), result.Weights()};
-    collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    if (info.IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(dat, 2);
+    }
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 0c0c9fc9f..a8b974f03 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -167,18 +167,20 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
                                  xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                                  std::vector<xgboost::bst_float> labels,
                                  std::vector<xgboost::bst_float> weights,
-                                 std::vector<xgboost::bst_uint> groups) {
+                                 std::vector<xgboost::bst_uint> groups,
+                                 xgboost::DataSplitMode data_split_mode) {
   return GetMultiMetricEval(
       metric, preds,
       xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1}, weights,
-      groups);
+      groups, data_split_mode);
 }
 
 double GetMultiMetricEval(xgboost::Metric* metric,
                           xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                           xgboost::linalg::Tensor<float, 2> const& labels,
                           std::vector<xgboost::bst_float> weights,
-                          std::vector<xgboost::bst_uint> groups) {
+                          std::vector<xgboost::bst_uint> groups,
+                          xgboost::DataSplitMode data_split_mode) {
   std::shared_ptr<xgboost::DMatrix> p_fmat{xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix()};
   auto& info = p_fmat->Info();
   info.num_row_ = labels.Shape(0);
@@ -186,6 +188,7 @@ double GetMultiMetricEval(xgboost::Metric* metric,
   info.labels.Data()->Copy(*labels.Data());
   info.weights_.HostVector() = weights;
   info.group_ptr_ = groups;
+  info.data_split_mode = data_split_mode;
 
   return metric->Evaluate(preds, p_fmat);
 }
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 025feae3e..5e65a1636 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -39,6 +39,18 @@
 #define GPUIDX -1
 #endif
 
+#if defined(__CUDACC__)
+#define DeclareUnifiedDistributedTest(name) MGPU ## name
+#else
+#define DeclareUnifiedDistributedTest(name) name
+#endif
+
+#if defined(__CUDACC__)
+#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs())
+#else
+#define WORLD_SIZE_FOR_TEST (3)
+#endif
+
 namespace xgboost {
 class ObjFunction;
 class Metric;
@@ -92,13 +104,15 @@ xgboost::bst_float GetMetricEval(
   xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
   std::vector<xgboost::bst_float> labels,
   std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
-  std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());
+  std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>(),
+  xgboost::DataSplitMode data_split_mode = xgboost::DataSplitMode::kRow);
 
 double GetMultiMetricEval(xgboost::Metric* metric,
                           xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                           xgboost::linalg::Tensor<float, 2> const& labels,
                           std::vector<xgboost::bst_float> weights = {},
-                          std::vector<xgboost::bst_uint> groups = {});
+                          std::vector<xgboost::bst_uint> groups = {},
+                          xgboost::DataSplitMode data_split_mode = xgboost::DataSplitMode::kRow);
 
 namespace xgboost {
 
@@ -496,4 +510,23 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
     thread.join();
   }
 }
+
+/*!
+ * \brief Fixture for distributed metric tests.
+ *
+ * SetUp() reads the world size from WORLD_SIZE_FOR_TEST and skips the test when fewer than
+ * two workers are available.
+ */
+class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
+ protected:
+  int world_size_{0};  // assigned in SetUp()
+
+  void SetUp() override {
+    world_size_ = WORLD_SIZE_FOR_TEST;
+    if (world_size_ <= 1) {
+      GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_;
+    }
+  }
+};
+
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_auc.cc b/tests/cpp/metric/test_auc.cc
index 2a6738899..de42bba53 100644
--- a/tests/cpp/metric/test_auc.cc
+++ b/tests/cpp/metric/test_auc.cc
@@ -1,261 +1,68 @@
+#include "test_auc.h"
+
 #include <xgboost/metric.h>
-#include "../helpers.h"
 
 namespace xgboost {
 namespace metric {
 
-TEST(Metric, DeclareUnifiedTest(BinaryAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> uni_ptr {Metric::Create("auc", &ctx)};
-  Metric * metric = uni_ptr.get();
-  ASSERT_STREQ(metric->Name(), "auc");
+TEST(Metric, DeclareUnifiedTest(BinaryAUC)) { VerifyBinaryAUC(); }
 
-  // Binary
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.0f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {1, 0}), 0.0f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {0, 1}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {0, 1}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {1, 0}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {1, 0}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {1, 0, 0}, {0, 0, 1}), 0.25f, 1e-10);
+TEST(Metric, DeclareUnifiedTest(MultiClassAUC)) { VerifyMultiClassAUC(); }
 
-  // Invalid dataset
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
-  float auc = metric->Evaluate({1, 1}, p_fmat);
-  ASSERT_TRUE(std::isnan(auc));
-  *info.labels.Data() = HostDeviceVector<float>{};
-  auc = metric->Evaluate(HostDeviceVector<float>{}, p_fmat);
-  ASSERT_TRUE(std::isnan(auc));
+TEST(Metric, DeclareUnifiedTest(RankingAUC)) { VerifyRankingAUC(); }
 
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1, 0, 1}, {0, 1, 0, 1}), 1.0f, 1e-10);
+TEST(Metric, DeclareUnifiedTest(PRAUC)) { VerifyPRAUC(); }
 
-  // AUC with instance weights
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.9f, 0.1f, 0.4f, 0.3f},
-                            {0,    0,    1,    1},
-                            {1.0f, 3.0f, 2.0f, 4.0f}),
-              0.75f, 0.001f);
+TEST(Metric, DeclareUnifiedTest(MultiClassPRAUC)) { VerifyMultiClassPRAUC(); }
 
-  // regression test case
-  ASSERT_NEAR(GetMetricEval(
-                  metric,
-                  {0.79523796, 0.5201713,  0.79523796, 0.24273258, 0.53452194,
-                   0.53452194, 0.24273258, 0.5201713,  0.79523796, 0.53452194,
-                   0.24273258, 0.53452194, 0.79523796, 0.5201713,  0.24273258,
-                   0.5201713,  0.5201713,  0.53452194, 0.5201713,  0.53452194},
-                  {0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0}),
-              0.5, 1e-10);
+TEST(Metric, DeclareUnifiedTest(RankingPRAUC)) { VerifyRankingPRAUC(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(MultiClassAUC)) {
-  auto ctx = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> uni_ptr{
-      Metric::Create("auc", &ctx)};
-  auto metric = uni_ptr.get();
-
-  // MultiClass
-  // 3x3
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {0, 1, 2}),
-              1.0f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {0, 1, 2},
-                            {1.0f, 1.0f, 1.0f}),
-              1.0f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {2, 1, 0}),
-              0.5f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {2, 0, 1}),
-              0.25f, 1e-10);
-
-  // invalid dataset
-  float auc = GetMetricEval(metric,
-                            {
-                                1.0f, 0.0f, 0.0f, // p_0
-                                0.0f, 1.0f, 0.0f, // p_1
-                                0.0f, 0.0f, 1.0f  // p_2
-                            },
-                            {0, 1, 1});  // no class 2.
-  EXPECT_TRUE(std::isnan(auc)) << auc;
-
-  HostDeviceVector<float> predts{
-    0.0f, 1.0f, 0.0f,
-    1.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 1.0f,
-    0.0f, 0.0f, 1.0f,
-  };
-  std::vector<float> labels {1.0f, 0.0f, 2.0f, 1.0f};
-  auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
-  ASSERT_GT(auc, 0.714);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(RankingAUC)) {
-  auto ctx = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> metric{Metric::Create("auc", &ctx)};
-
-  // single group
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0.7f, 0.2f, 0.3f, 0.6f},
-                            {1.0f, 0.8f, 0.4f, 0.2f}, /*weights=*/{},
-                            {0, 4}),
-              0.5f, 1e-10);
-
-  // multi group
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2},
-                            {0, 1, 2, 0, 1, 2}, /*weights=*/{}, {0, 3, 6}),
-              1.0f, 1e-10);
-
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2},
-                            {0, 1, 2, 0, 1, 2}, /*weights=*/{1.0f, 2.0f},
-                            {0, 3, 6}),
-              1.0f, 1e-10);
-
-  // AUC metric for grouped datasets - exception scenarios
-  ASSERT_TRUE(std::isnan(
-      GetMetricEval(metric.get(), {0, 1, 2}, {0, 0, 0}, {}, {0, 2, 3})));
-
-  // regression case
-  HostDeviceVector<float> predt{0.33935383, 0.5149714,  0.32138085, 1.4547751,
-                                1.2010975,  0.42651367, 0.23104341, 0.83610827,
-                                0.8494239,  0.07136688, 0.5623144,  0.8086237,
-                                1.5066161,  -4.094787,  0.76887935, -2.4082742};
-  std::vector<bst_group_t> groups{0, 7, 16};
-  std::vector<float> labels{1., 0., 0., 1., 2., 1., 0., 0.,
-                            0., 0., 0., 0., 1., 0., 1., 0.};
-
-  EXPECT_NEAR(GetMetricEval(metric.get(), std::move(predt), labels,
-                            /*weights=*/{}, groups),
-              0.769841f, 1e-6);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(PRAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &ctx);
-  ASSERT_STREQ(metric->Name(), "aucpr");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(
-                  metric,
-                  {0.4f, 0.2f, 0.9f, 0.1f, 0.2f, 0.4f, 0.1f, 0.1f, 0.2f, 0.1f},
-                  {0, 0, 0, 0, 0, 1, 0, 0, 1, 1}),
-              0.2908445f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(
-                  metric, {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f,
-                           0.09f, 0.10f, 0.97f, 0.76f, 0.69f, 0.15f, 0.20f,
-                           0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
-                  {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}),
-              0.2769199f, 0.001f);
-  auto auc = GetMetricEval(metric, {0, 1}, {});
-  ASSERT_TRUE(std::isnan(auc));
-
-  // AUCPR with instance weights
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.29f, 0.52f, 0.11f, 0.21f, 0.219f, 0.93f, 0.493f,
-                             0.17f, 0.47f, 0.13f, 0.43f, 0.59f, 0.87f, 0.007f},
-                            {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0},
-                            {1, 2, 7, 4, 5, 2.2f, 3.2f, 5, 6, 1, 2, 1.1f, 3.2f,
-                             4.5f}), // weights
-              0.694435f, 0.001f);
-
-  // Both groups contain only pos or neg samples.
-  auc = GetMetricEval(metric,
-                      {0, 0.1f, 0.3f, 0.5f, 0.7f},
-                      {1, 1, 0, 0, 0},
-                      {},
-                      {0, 2, 5});
-  ASSERT_TRUE(std::isnan(auc));
-  delete metric;
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MultiClassPRAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
-
-  float auc = 0;
-  std::vector<float> labels {1.0f, 0.0f, 2.0f};
-  HostDeviceVector<float> predts{
-    0.0f, 1.0f, 0.0f,
-    1.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 1.0f,
-  };
-  auc = GetMetricEval(metric.get(), predts, labels, {});
-  EXPECT_EQ(auc, 1.0f);
-
-  auc = GetMetricEval(metric.get(), predts, labels, {1.0f, 1.0f, 1.0f});
-  EXPECT_EQ(auc, 1.0f);
-
-  predts.HostVector() =  {
-    0.0f, 1.0f, 0.0f,
-    1.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 1.0f,
-    0.0f, 0.0f, 1.0f,
-  };
-  labels = {1.0f, 0.0f, 2.0f, 1.0f};
-  auc = GetMetricEval(metric.get(), predts, labels, {1.0f, 2.0f, 3.0f, 4.0f});
-  ASSERT_GT(auc, 0.699);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(RankingPRAUC)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kCol);
+}
 
-  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kRow);
+}
 
-  std::vector<float> labels {1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f};
-  std::vector<uint32_t> groups {0, 2, 6};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kCol);
+}
 
-  float auc = 0;
-  auc = GetMetricEval(metric.get(), {1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f}, labels, {}, groups);
-  EXPECT_EQ(auc, 1.0f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kRow);
+}
 
-  auc = GetMetricEval(metric.get(), {1.0f, 0.5f, 0.8f, 0.3f, 0.2f, 1.0f}, labels, {}, groups);
-  EXPECT_EQ(auc, 1.0f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kCol);
+}
 
-  auc = GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
-                      {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {}, groups);
-  ASSERT_TRUE(std::isnan(auc));
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kRow);
+}
 
-  // Incorrect label
-  ASSERT_THROW(GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
-                             {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 3.0f}, {}, groups),
-               dmlc::Error);
-
-  // AUCPR with groups and no weights
-  EXPECT_NEAR(GetMetricEval(
-      metric.get(), {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f,
-                     0.09f, 0.10f, 0.97f, 0.76f, 0.69f, 0.15f, 0.20f,
-                     0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
-                  {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1},
-                  {},  // weights
-                  {0, 2, 5, 9, 14, 20}),  // group info
-              0.556021f, 0.001f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h
new file mode 100644
index 000000000..3baa53290
--- /dev/null
+++ b/tests/cpp/metric/test_auc.h
@@ -0,0 +1,249 @@
+/*!
+ * Copyright (c) 2023 by XGBoost Contributors
+ */
+#pragma once
+
+#include <xgboost/metric.h>
+
+#include "../helpers.h"
+
+namespace xgboost {
+namespace metric {
+
+inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> uni_ptr{Metric::Create("auc", &ctx)};
+  Metric* metric = uni_ptr.get();
+  ASSERT_STREQ(metric->Name(), "auc");
+
+  // Binary
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {1, 0}, {}, {}, data_split_mode), 0.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {1, 0}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {1, 0}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 0, 0}, {0, 0, 1}, {}, {}, data_split_mode), 0.25f, 1e-10);
+
+  // Invalid dataset
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
+  float auc = metric->Evaluate({1, 1}, p_fmat);
+  ASSERT_TRUE(std::isnan(auc));
+  *info.labels.Data() = HostDeviceVector<float>{};
+  auc = metric->Evaluate(HostDeviceVector<float>{}, p_fmat);
+  ASSERT_TRUE(std::isnan(auc));
+
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1, 0, 1}, {0, 1, 0, 1}, {}, {}, data_split_mode), 1.0f,
+              1e-10);
+
+  // AUC with instance weights
+  EXPECT_NEAR(GetMetricEval(metric, {0.9f, 0.1f, 0.4f, 0.3f}, {0, 0, 1, 1},
+                            {1.0f, 3.0f, 2.0f, 4.0f}, {}, data_split_mode),
+              0.75f, 0.001f);
+
+  // regression test case
+  ASSERT_NEAR(GetMetricEval(metric, {0.79523796, 0.5201713,  0.79523796, 0.24273258, 0.53452194,
+                                     0.53452194, 0.24273258, 0.5201713,  0.79523796, 0.53452194,
+                                     0.24273258, 0.53452194, 0.79523796, 0.5201713,  0.24273258,
+                                     0.5201713,  0.5201713,  0.53452194, 0.5201713,  0.53452194},
+                            {0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0}, {}, {},
+                            data_split_mode),
+              0.5, 1e-10);
+}
+
+inline void VerifyMultiClassAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> uni_ptr{Metric::Create("auc", &ctx)};
+  auto metric = uni_ptr.get();
+
+  // MultiClass
+  // 3x3
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {0, 1, 2}, {}, {}, data_split_mode),
+              1.0f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {0, 1, 2}, {1.0f, 1.0f, 1.0f}, {}, data_split_mode),
+              1.0f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {2, 1, 0}, {}, {}, data_split_mode),
+              0.5f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,  // p_0
+                                0.0f, 1.0f, 0.0f,  // p_1
+                                0.0f, 0.0f, 1.0f   // p_2
+                            },
+                            {2, 0, 1}, {}, {}, data_split_mode),
+              0.25f, 1e-10);
+
+  // invalid dataset
+  float auc = GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f,                 // p_0
+                                0.0f, 1.0f, 0.0f,                 // p_1
+                                0.0f, 0.0f, 1.0f                  // p_2
+                            },
+                            {0, 1, 1}, {}, {}, data_split_mode);  // no class 2.
+  EXPECT_TRUE(std::isnan(auc)) << auc;
+
+  HostDeviceVector<float> predts{
+      0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f,
+  };
+  std::vector<float> labels{1.0f, 0.0f, 2.0f, 1.0f};
+  auc = GetMetricEval(metric, predts, labels, {1.0f, 2.0f, 3.0f, 4.0f}, {}, data_split_mode);
+  ASSERT_GT(auc, 0.714);
+}
+
+// Runs the "auc" metric over query-grouped inputs and compares against reference values.
+inline void VerifyRankingAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("auc", &ctx)};
+  Metric* const evaluator = metric.get();  // borrowed; `metric` keeps ownership
+
+  // One group spanning all four rows.
+  EXPECT_NEAR(GetMetricEval(evaluator, {0.7f, 0.2f, 0.3f, 0.6f}, {1.0f, 0.8f, 0.4f, 0.2f},
+                            /*weights=*/{}, {0, 4}, data_split_mode),
+              0.5f, 1e-10);
+
+  // Two groups of three rows each, first without weights and then with weights {1, 2}.
+  EXPECT_NEAR(GetMetricEval(evaluator, {0, 1, 2, 0, 1, 2}, {0, 1, 2, 0, 1, 2}, /*weights=*/{},
+                            {0, 3, 6}, data_split_mode),
+              1.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(evaluator, {0, 1, 2, 0, 1, 2}, {0, 1, 2, 0, 1, 2},
+                            /*weights=*/{1.0f, 2.0f}, {0, 3, 6}, data_split_mode),
+              1.0f, 1e-10);
+
+  // Degenerate input: every label is zero, so the result must be NaN.
+  ASSERT_TRUE(std::isnan(
+      GetMetricEval(evaluator, {0, 1, 2}, {0, 0, 0}, {}, {0, 2, 3}, data_split_mode)));
+
+  // Regression case: 16 rows split into two groups at row 7.
+  HostDeviceVector<float> scores{
+      0.33935383, 0.5149714,  0.32138085, 1.4547751, 1.2010975, 0.42651367, 0.23104341, 0.83610827,
+      0.8494239,  0.07136688, 0.5623144,  0.8086237, 1.5066161, -4.094787,  0.76887935, -2.4082742};
+  std::vector<float> relevance{1., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.};
+  std::vector<bst_group_t> group_ptr{0, 7, 16};
+  EXPECT_NEAR(GetMetricEval(evaluator, std::move(scores), relevance,
+                            /*weights=*/{}, group_ptr, data_split_mode),
+              0.769841f, 1e-6);
+}
+
+// Checks "aucpr" on 0/1-labelled inputs (plain, weighted, grouped) against reference values.
+inline void VerifyPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> owner{xgboost::Metric::Create("aucpr", &ctx)};
+  xgboost::Metric* metric = owner.get();  // non-owning; `owner` frees it on every exit path
+  ASSERT_STREQ(metric->Name(), "aucpr");
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(
+      GetMetricEval(metric, {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode), 0.5f,
+      0.001f);
+  EXPECT_NEAR(GetMetricEval(metric, {0.4f, 0.2f, 0.9f, 0.1f, 0.2f, 0.4f, 0.1f, 0.1f, 0.2f, 0.1f},
+                            {0, 0, 0, 0, 0, 1, 0, 0, 1, 1}, {}, {}, data_split_mode),
+              0.2908445f, 0.001f);
+  EXPECT_NEAR(
+      GetMetricEval(metric, {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f, 0.09f, 0.10f, 0.97f,
+                             0.76f, 0.69f, 0.15f, 0.20f, 0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
+                    {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}, {}, {},
+                    data_split_mode),
+      0.2769199f, 0.001f);
+  auto auc = GetMetricEval(metric, {0, 1}, {}, {}, {}, data_split_mode);  // no labels given
+  ASSERT_TRUE(std::isnan(auc));
+
+  // AUCPR with instance weights
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {0.29f, 0.52f, 0.11f, 0.21f, 0.219f, 0.93f, 0.493f, 0.17f, 0.47f, 0.13f,
+                             0.43f, 0.59f, 0.87f, 0.007f},
+                            {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0},
+                            {1, 2, 7, 4, 5, 2.2f, 3.2f, 5, 6, 1, 2, 1.1f, 3.2f, 4.5f},  // weights
+                            {}, data_split_mode),
+              0.694435f, 0.001f);
+
+  // Two groups, {1, 1} and {0, 0, 0}: each holds a single class, so NaN is expected.
+  auc = GetMetricEval(metric, {0, 0.1f, 0.3f, 0.5f, 0.7f}, {1, 1, 0, 0, 0}, {}, {0, 2, 5},
+                      data_split_mode);
+  ASSERT_TRUE(std::isnan(auc));
+}
+
+// Evaluates "aucpr" on three-class inputs: two exact-match runs, then a weighted 4-row run.
+inline void VerifyMultiClassPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
+  // Nine scores for three labelled rows; read row-major, each row's 1 sits at its label index.
+  HostDeviceVector<float> predts{
+      0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f,
+  };
+  std::vector<float> labels{1.0f, 0.0f, 2.0f};
+  float got = GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode);
+  EXPECT_EQ(got, 1.0f);
+  // All-ones weights are expected to give the same score.
+  got = GetMetricEval(metric.get(), predts, labels, {1.0f, 1.0f, 1.0f}, {}, data_split_mode);
+  EXPECT_EQ(got, 1.0f);
+
+  // Append a fourth row (three more scores, label 1) and use distinct weights.
+  labels = {1.0f, 0.0f, 2.0f, 1.0f};
+  predts.HostVector() = {
+      0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f,
+  };
+  got = GetMetricEval(metric.get(), predts, labels, {1.0f, 2.0f, 3.0f, 4.0f}, {}, data_split_mode);
+  ASSERT_GT(got, 0.699);
+}
+
+// Evaluates "aucpr" on query-grouped inputs, including a NaN case and an invalid label.
+inline void VerifyRankingPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("aucpr", &ctx)};
+
+  std::vector<uint32_t> groups{0, 2, 6};
+  std::vector<float> labels{1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f};
+
+  // Scores identical to the labels.
+  float got = GetMetricEval(metric.get(), {1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 1.0f}, labels, {}, groups,
+                            data_split_mode);
+  EXPECT_EQ(got, 1.0f);
+  // Fractional scores that still rank each positive above the negatives of its group.
+  got = GetMetricEval(metric.get(), {1.0f, 0.5f, 0.8f, 0.3f, 0.2f, 1.0f}, labels, {}, groups,
+                      data_split_mode);
+  EXPECT_EQ(got, 1.0f);
+  // Each group holds one class only ({1, 1} and {0, 0, 0, 0}); NaN is expected.
+  got = GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                      {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {}, groups, data_split_mode);
+  ASSERT_TRUE(std::isnan(got));
+
+  // A label of 3.0 must raise dmlc::Error.
+  ASSERT_THROW(GetMetricEval(metric.get(), {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                             {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 3.0f}, {}, groups, data_split_mode),
+               dmlc::Error);
+
+  // Five groups, no weights.
+  EXPECT_NEAR(
+      GetMetricEval(metric.get(),
+                    {0.87f, 0.31f, 0.40f, 0.42f, 0.25f, 0.66f, 0.95f, 0.09f, 0.10f, 0.97f,
+                     0.76f, 0.69f, 0.15f, 0.20f, 0.30f, 0.14f, 0.07f, 0.58f, 0.61f, 0.08f},
+                    {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}, {},  // weights
+                    {0, 2, 5, 9, 14, 20},                                              // group info
+                    data_split_mode),
+      0.556021f, 0.001f);
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index 9000cfc09..2407dde39 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -1,347 +1,108 @@
 /**
  * Copyright 2018-2023 by XGBoost contributors
  */
-#include <xgboost/json.h>
-#include <xgboost/metric.h>
-
-#include <map>
-#include <memory>
-
-#include "../../../src/common/linalg_op.h"
-#include "../helpers.h"
-
-namespace xgboost {
-namespace {
-inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
-  auto ctx = CreateEmptyGenericParam(device);
-  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
-
-  HostDeviceVector<float> predts;
-  size_t n_samples = 2048;
-
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.labels.Reshape(n_samples, 1);
-  info.num_row_ = n_samples;
-  auto &h_labels = info.labels.Data()->HostVector();
-  auto &h_predts = predts.HostVector();
-
-  SimpleLCG lcg;
-  SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
-
-  h_labels.resize(n_samples);
-  h_predts.resize(n_samples);
-
-  for (size_t i = 0; i < n_samples; ++i) {
-    h_predts[i] = dist(&lcg);
-    h_labels[i] = dist(&lcg);
-  }
-
-  auto result = metric->Evaluate(predts, p_fmat);
-  for (size_t i = 0; i < 8; ++i) {
-    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
-  }
-}
-}  // anonymous namespace
-}  // namespace xgboost
+#include "test_elementwise_metric.h"
 
 namespace xgboost {
 namespace metric {
+TEST(Metric, DeclareUnifiedTest(RMSE)) { VerifyRMSE(); }
 
-TEST(Metric, DeclareUnifiedTest(RMSE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("rmse", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "rmse");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.6403f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              2.8284f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.6708f, 0.001f);
-  delete metric;
+TEST(Metric, DeclareUnifiedTest(RMSLE)) { VerifyRMSLE(); }
 
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"rmse"}, GPUIDX);
+TEST(Metric, DeclareUnifiedTest(MAE)) { VerifyMAE(); }
+
+TEST(Metric, DeclareUnifiedTest(MAPE)) { VerifyMAPE(); }
+
+TEST(Metric, DeclareUnifiedTest(MPHE)) { VerifyMPHE(); }
+
+TEST(Metric, DeclareUnifiedTest(LogLoss)) { VerifyLogLoss(); }
+
+TEST(Metric, DeclareUnifiedTest(Error)) { VerifyError(); }
+
+TEST(Metric, DeclareUnifiedTest(PoissonNegLogLik)) { VerifyPoissonNegLogLik(); }
+
+TEST(Metric, DeclareUnifiedTest(MultiRMSE)) { VerifyMultiRMSE(); }
+
+TEST(Metric, DeclareUnifiedTest(Quantile)) { VerifyQuantile(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(RMSLE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "rmsle");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
-                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}),
-              0.4063f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
-                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
-                            {   0,   -1,    1,    -9,   9}),
-              0.6212f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
-                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
-                            {   0,    1,    2,    9,    8}),
-              0.2415f, 1e-4);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"rmsle"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MAE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("mae", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mae");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              8.0f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.54f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mae"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(MAPE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("mape", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mape");
-  EXPECT_NEAR(GetMetricEval(metric, {150, 300}, {100, 200}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {50, 400, 500, 4000},
-                            {100, 200, 500, 1000}),
-              1.125f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {50, 400, 500, 4000},
-                            {100, 200, 500, 1000},
-                            { -1,   1,   9,  -9}),
-              -26.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {50, 400, 500, 4000},
-                            {100, 200, 500, 1000},
-                            {  1,   2,   9,   8}),
-              1.3250f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mape"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MPHE)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mphe", &ctx)};
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mphe");
-  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.1751f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              3.4037f, 1e-4);
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.1922f, 1e-4);
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mphe"}, GPUIDX);
-
-  metric->Configure({{"huber_slope", "0.1"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(),
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.0461686f, 1e-4);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(LogLoss)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("logloss", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "logloss");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
-                            {   0,      0,           1,    1}),
-              0.1996f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              1.2039f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              21.9722f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              1.3138f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"logloss"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(Error)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("error", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "error");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                           {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              10.0f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.55f, 0.001f);
-
-  EXPECT_ANY_THROW(xgboost::Metric::Create("error@abc", &ctx));
-  delete metric;
-
-  metric = xgboost::Metric::Create("error@0.5f", &ctx);
-  metric->Configure({});
-  EXPECT_STREQ(metric->Name(), "error");
-
-  delete metric;
-
-  metric = xgboost::Metric::Create("error@0.1", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "error@0.1");
-  EXPECT_STREQ(metric->Name(), "error@0.1");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {-0.1f, -0.9f, 0.1f, 0.9f},
-                            {   0,    0,   1,   1}),
-              0.25f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {-0.1f, -0.9f, 0.1f, 0.9f},
-                            {   0,    0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              9.0f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {-0.1f, -0.9f, 0.1f, 0.9f},
-                            {   0,    0,   1,   1},
-                            {  1,   2,   9,   8}),
-              0.45f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"error@0.5"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "poisson-nloglik");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.5f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
-                            {   0,      0,           1,    1}),
-              0.6263f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              1.1019f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            { -1,   1,   9,  -9}),
-              13.3750f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1},
-                            {  1,   2,   9,   8}),
-              1.5783f, 0.001f);
-  delete metric;
-
-  xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"poisson-nloglik"}, GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MultiRMSE)) {
-  size_t n_samples = 32, n_targets = 8;
-  linalg::Tensor<float, 2> y{{n_samples, n_targets}, GPUIDX};
-  auto &h_y = y.Data()->HostVector();
-  std::iota(h_y.begin(), h_y.end(), 0);
-
-  HostDeviceVector<float> predt(n_samples * n_targets, 0);
-
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> metric{Metric::Create("rmse", &ctx)};
-  metric->Configure({});
-
-  auto loss = GetMultiMetricEval(metric.get(), predt, y);
-  std::vector<float> weights(n_samples, 1);
-  auto loss_w = GetMultiMetricEval(metric.get(), predt, y, weights);
-
-  std::transform(h_y.cbegin(), h_y.cend(), h_y.begin(), [](auto &v) { return v * v; });
-  auto ret = std::sqrt(std::accumulate(h_y.cbegin(), h_y.cend(), 1.0, std::plus<>{}) / h_y.size());
-  ASSERT_FLOAT_EQ(ret, loss);
-  ASSERT_FLOAT_EQ(ret, loss_w);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(Quantile)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Metric> metric{Metric::Create("quantile", &ctx)};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kCol);
+}
 
-  HostDeviceVector<float> predts{0.1f, 0.9f, 0.1f, 0.9f};
-  std::vector<float> labels{0.5f, 0.5f, 0.9f, 0.1f};
-  std::vector<float> weights{0.2f,  0.4f,0.6f, 0.8f};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kRow);
+}
 
-  metric->Configure(Args{{"quantile_alpha", "[0.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.400f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.2]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.376f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.4]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.352f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.8]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.304f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[1.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights), 0.28f, 0.001f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kCol);
+}
 
-  metric->Configure(Args{{"quantile_alpha", "[0.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.2]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.4]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[0.8]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
-  metric->Configure(Args{{"quantile_alpha", "[1.0]"}});
-  EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels), 0.3f, 0.001f);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSERowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSEColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h
new file mode 100644
index 000000000..1b06194fe
--- /dev/null
+++ b/tests/cpp/metric/test_elementwise_metric.h
@@ -0,0 +1,385 @@
+/**
+ * Copyright 2018-2023 by XGBoost contributors
+ */
+#pragma once
+#include <xgboost/json.h>
+#include <xgboost/metric.h>
+
+#include <map>
+#include <memory>
+
+#include "../../../src/common/linalg_op.h"
+#include "../helpers.h"
+
+namespace xgboost {
+namespace metric {
+
+// Evaluates metric `name` repeatedly on one generated dataset; every repeat must match the first.
+inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
+  auto ctx = CreateEmptyGenericParam(device);
+  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+  size_t n_rows = 2048;
+
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& meta = p_fmat->Info();
+  meta.num_row_ = n_rows;
+  meta.labels.Reshape(n_rows, 1);
+  auto& label_buf = meta.labels.Data()->HostVector();
+  label_buf.resize(n_rows);
+
+  HostDeviceVector<float> predts;
+  auto& predt_buf = predts.HostVector();
+  predt_buf.resize(n_rows);
+
+  // Each row draws its prediction first and its label second from the same generator.
+  SimpleLCG rng;
+  SimpleRealUniformDistribution<float> uniform{0.0f, 1.0f};
+  for (size_t row = 0; row != n_rows; ++row) {
+    predt_buf[row] = uniform(&rng);
+    label_buf[row] = uniform(&rng);
+  }
+
+  auto reference = metric->Evaluate(predts, p_fmat);
+  for (size_t repeat = 0; repeat < 8; ++repeat) {
+    ASSERT_EQ(metric->Evaluate(predts, p_fmat), reference);
+  }
+}
+
+// Checks "rmse" against reference values: unweighted, zero-sum weights, positive weights.
+inline void VerifyRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("rmse", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "rmse");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6403f, 0.001f);
+  // The next weights sum to zero; in distributed row split the expectation depends on world size.
+  auto expected = 2.8284f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected = std::sqrt(8.0f * collective::GetWorldSize());
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.6708f, 0.001f);
+  CheckDeterministicMetricElementWise(StringView{"rmse"}, GPUIDX);
+}
+
+// Checks "rmsle" against reference values: unweighted, zero-sum weights, non-negative weights.
+inline void VerifyRMSLE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("rmsle", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "rmsle");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, {}, {}, data_split_mode),
+              0.4063f, 1e-4);
+  // The next weights sum to zero; in distributed row split the expectation depends on world size.
+  auto expected = 0.6212f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected = std::sqrt(0.3859f * collective::GetWorldSize());
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+                            {   0,   -1,    1,    -9,   9}, {}, data_split_mode),
+              expected, 1e-4);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.2f, 0.4f, 0.8f, 1.6f},
+                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+                            {   0,    1,    2,    9,    8}, {}, data_split_mode),
+              0.2415f, 1e-4);
+  CheckDeterministicMetricElementWise(StringView{"rmsle"}, GPUIDX);
+}
+
+// Checks "mae" against reference values: unweighted, zero-sum weights, positive weights.
+inline void VerifyMAE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mae", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mae");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+  // The next weights sum to zero; in distributed row split the expectation scales with world size.
+  auto expected = 8.0f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.54f, 0.001f);
+  CheckDeterministicMetricElementWise(StringView{"mae"}, GPUIDX);
+}
+
+// Checks "mape" against reference values: unweighted, zero-sum weights, positive weights.
+inline void VerifyMAPE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mape", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mape");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {150, 300}, {100, 200}, {}, {}, data_split_mode),
+              0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {50, 400, 500, 4000},
+                            {100, 200, 500, 1000}, {}, {}, data_split_mode),
+              1.125f, 0.001f);
+  auto expected = -26.5f;  // zero-sum weights below: scaled by world size in row split
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {50, 400, 500, 4000},
+                            {100, 200, 500, 1000},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {50, 400, 500, 4000},
+                            {100, 200, 500, 1000},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              1.3250f, 0.001f);
+  CheckDeterministicMetricElementWise(StringView{"mape"}, GPUIDX);
+}
+
+// Checks "mphe" against reference values, including a run with a non-default `huber_slope`.
+inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("mphe", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mphe");
+  xgboost::Metric* const mphe = metric.get();  // borrowed pointer for the calls below
+  EXPECT_NEAR(GetMetricEval(mphe, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(mphe,
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.1751f, 1e-4);
+  // Zero-sum weights: the expected value is multiplied by the world size in row split.
+  bool const row_split = collective::IsDistributed() && data_split_mode == DataSplitMode::kRow;
+  auto const expected = row_split ? 3.40375f * collective::GetWorldSize() : 3.40375f;
+  EXPECT_NEAR(GetMetricEval(mphe,
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 1e-4);
+  EXPECT_NEAR(GetMetricEval(mphe,
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.1922f, 1e-4);
+  CheckDeterministicMetricElementWise(StringView{"mphe"}, GPUIDX);
+  // Re-run the positive-weight case after setting huber_slope to 0.1.
+  metric->Configure({{"huber_slope", "0.1"}});
+  EXPECT_NEAR(GetMetricEval(mphe,
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.0461686f, 1e-4);
+}
+
+// Checks "logloss" against reference values: unweighted, zero-sum weights, positive weights.
+inline void VerifyLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<xgboost::Metric> metric{xgboost::Metric::Create("logloss", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "logloss");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
+                            {   0,      0,           1,    1}, {}, {}, data_split_mode),
+              0.1996f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              1.2039f, 0.001f);
+  // The next weights sum to zero; in distributed row split the expectation scales with world size.
+  auto expected = 21.9722f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              1.3138f, 0.001f);
+  CheckDeterministicMetricElementWise(StringView{"logloss"}, GPUIDX);
+}
+
+// Checks "error" and "error@0.1" against fixed reference values for the given split mode,
+// plus how "@threshold" suffixes show up in the metric name. The metric is owned by a
+// unique_ptr so that a fatal ASSERT_* cannot leak it.
+inline void VerifyError(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("error", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "error");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+  // Weights sum to zero here; under row split the reference scales with the world size.
+  auto expected = 10.0f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.55f, 0.001f);
+
+  EXPECT_ANY_THROW(xgboost::Metric::Create("error@abc", &ctx));
+
+  // An explicit 0.5 threshold is expected to be reported under the plain "error" name.
+  metric.reset(Metric::Create("error@0.5f", &ctx));
+  metric->Configure({});
+  EXPECT_STREQ(metric->Name(), "error");
+
+  metric.reset(Metric::Create("error@0.1", &ctx));
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "error@0.1");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {-0.1f, -0.9f, 0.1f, 0.9f},
+                            {   0,    0,   1,   1}, {}, {}, data_split_mode),
+              0.25f, 0.001f);
+  expected = 9.0f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {-0.1f, -0.9f, 0.1f, 0.9f},
+                            {   0,    0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {-0.1f, -0.9f, 0.1f, 0.9f},
+                            {   0,    0,   1,   1},
+                            {  1,   2,   9,   8}, {}, data_split_mode),
+              0.45f, 0.001f);
+
+  CheckDeterministicMetricElementWise(StringView{"error@0.5"}, GPUIDX);
+}
+
+// Checks "poisson-nloglik" against fixed reference values, unweighted and weighted, for the
+// given split mode. The metric is owned by a unique_ptr so a fatal ASSERT_* cannot leak it.
+inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("poisson-nloglik", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "poisson-nloglik");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.5f, 1e-17f, 1.0f+1e-17f, 0.9f},
+                            {   0,      0,           1,    1}, {}, {}, data_split_mode),
+              0.6263f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              1.1019f, 0.001f);
+  // Weights sum to zero here; under row split the reference scales with the world size.
+  auto expected = 13.3750f;
+  if (collective::IsDistributed() && data_split_mode == DataSplitMode::kRow) {
+    expected *= collective::GetWorldSize();
+  }
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1},
+                            { -1,   1,   9,  -9}, {}, data_split_mode),
+              expected, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {0, 0, 1, 1}, {1, 2, 9, 8}, {}, data_split_mode),
+              1.5783f, 0.001f);
+  CheckDeterministicMetricElementWise(StringView{"poisson-nloglik"}, GPUIDX);
+}
+
+// Multi-target "rmse" with all-zero predictions must equal sqrt(mean(y^2)); passing unit
+// weights must give the same value.
+inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  size_t n_samples = 32, n_targets = 8;
+  linalg::Tensor<float, 2> y{{n_samples, n_targets}, GPUIDX};
+  auto &h_y = y.Data()->HostVector();
+  std::iota(h_y.begin(), h_y.end(), 0);
+  HostDeviceVector<float> predt(n_samples * n_targets, 0);
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("rmse", &ctx)};
+  metric->Configure({});
+  auto loss = GetMultiMetricEval(metric.get(), predt, y, {}, {}, data_split_mode);
+  std::vector<float> weights(n_samples, 1);
+  auto loss_w = GetMultiMetricEval(metric.get(), predt, y, weights, {}, data_split_mode);
+
+  // Reference value: squared labels summed in double from a 0.0 seed (1.0 would bias the mean).
+  std::transform(h_y.cbegin(), h_y.cend(), h_y.begin(), [](auto &v) { return v * v; });
+  auto ret = std::sqrt(std::accumulate(h_y.cbegin(), h_y.cend(), 0.0) / h_y.size());
+  ASSERT_FLOAT_EQ(ret, loss);
+  ASSERT_FLOAT_EQ(ret, loss_w);
+}
+
+// Checks the "quantile" metric at several `quantile_alpha` settings, first with sample
+// weights and then without, for the given split mode.
+inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("quantile", &ctx)};
+
+  HostDeviceVector<float> predts{0.1f, 0.9f, 0.1f, 0.9f};
+  std::vector<float> labels{0.5f, 0.5f, 0.9f, 0.1f};
+  std::vector<float> weights{0.2f, 0.4f, 0.6f, 0.8f};
+
+  // One row per configured alpha: the `quantile_alpha` string and the weighted reference loss.
+  struct Case {
+    char const *alpha;
+    float weighted;
+  };
+  std::vector<Case> const cases{{"[0.0]", 0.400f},
+                                {"[0.2]", 0.376f},
+                                {"[0.4]", 0.352f},
+                                {"[0.8]", 0.304f},
+                                {"[1.0]", 0.28f}};
+
+  // Weighted pass.
+  for (auto const &c : cases) {
+    metric->Configure(Args{{"quantile_alpha", c.alpha}});
+    EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, weights, {}, data_split_mode),
+                c.weighted, 0.001f);
+  }
+
+  // Unweighted pass: every alpha is expected to give 0.3 for this data.
+  for (auto const &c : cases) {
+    metric->Configure(Args{{"quantile_alpha", c.alpha}});
+    EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode), 0.3f,
+                0.001f);
+  }
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_multiclass_metric.cc b/tests/cpp/metric/test_multiclass_metric.cc
index 2465b11c8..bfb638924 100644
--- a/tests/cpp/metric/test_multiclass_metric.cc
+++ b/tests/cpp/metric/test_multiclass_metric.cc
@@ -1,87 +1,29 @@
 // Copyright by Contributors
-#include <xgboost/metric.h>
+#include "test_multiclass_metric.h"
+
 #include <string>
 
-#include "../helpers.h"
-
 namespace xgboost {
-inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device) {
-  auto ctx = CreateEmptyGenericParam(device);
-  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+namespace metric {
 
-  HostDeviceVector<float> predts;
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  auto &h_predts = predts.HostVector();
+TEST(Metric, DeclareUnifiedTest(MultiClassError)) { VerifyMultiClassError(); }
 
-  SimpleLCG lcg;
+TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) { VerifyMultiClassLogLoss(); }
 
-  size_t n_samples = 2048, n_classes = 4;
-
-  info.labels.Reshape(n_samples);
-  auto &h_labels = info.labels.Data()->HostVector();
-  h_predts.resize(n_samples * n_classes);
-
-  {
-    SimpleRealUniformDistribution<float> dist{0.0f, static_cast<float>(n_classes)};
-    for (size_t i = 0; i < n_samples; ++i) {
-      h_labels[i] = dist(&lcg);
-    }
-  }
-
-  {
-    SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
-    for (size_t i = 0; i < n_samples * n_classes; ++i) {
-      h_predts[i] = dist(&lcg);
-    }
-  }
-
-  auto result = metric->Evaluate(predts, p_fmat);
-  for (size_t i = 0; i < 8; ++i) {
-    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
-  }
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kRow);
 }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kCol);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kCol);
+}
+}  // namespace metric
 }  // namespace xgboost
-
-inline void TestMultiClassError(int device) {
-  auto ctx = xgboost::CreateEmptyGenericParam(device);
-  ctx.gpu_id = device;
-  xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "merror");
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0}, {0, 0}));
-  EXPECT_NEAR(GetMetricEval(
-      metric, {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
-                            {0, 1, 2}),
-              0.666f, 0.001f);
-  delete metric;
-}
-
-TEST(Metric, DeclareUnifiedTest(MultiClassError)) {
-  TestMultiClassError(GPUIDX);
-  xgboost::CheckDeterministicMetricMultiClass(xgboost::StringView{"merror"}, GPUIDX);
-}
-
-inline void TestMultiClassLogLoss(int device) {
-  auto ctx = xgboost::CreateEmptyGenericParam(device);
-  ctx.gpu_id = device;
-  xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx);
-  metric->Configure({});
-  ASSERT_STREQ(metric->Name(), "mlogloss");
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0}, {0, 0}));
-  EXPECT_NEAR(GetMetricEval(
-    metric, {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
-                            {0, 1, 2}),
-              2.302f, 0.001f);
-
-  delete metric;
-}
-
-TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) {
-  TestMultiClassLogLoss(GPUIDX);
-  xgboost::CheckDeterministicMetricMultiClass(xgboost::StringView{"mlogloss"}, GPUIDX);
-}
diff --git a/tests/cpp/metric/test_multiclass_metric.h b/tests/cpp/metric/test_multiclass_metric.h
new file mode 100644
index 000000000..cd2b142fc
--- /dev/null
+++ b/tests/cpp/metric/test_multiclass_metric.h
@@ -0,0 +1,91 @@
+// Copyright by Contributors
+#pragma once
+#include <xgboost/metric.h>
+#include <memory>
+#include <string>
+#include "../helpers.h"
+namespace xgboost {
+namespace metric {
+
+// Evaluates the multi-class metric `name` several times on one fixed pseudo-random input
+// (2048 rows, 4 classes) and requires every repeat to reproduce the first result exactly.
+// `device` is forwarded to CreateEmptyGenericParam to build the context.
+inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device) {
+  auto ctx = CreateEmptyGenericParam(device);
+  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+
+  size_t n_rows = 2048, n_cls = 4;
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels.Reshape(n_rows);
+  auto &truth = info.labels.Data()->HostVector();
+
+  HostDeviceVector<float> predts;
+  auto &scores = predts.HostVector();
+  scores.resize(n_rows * n_cls);
+
+  // Both fills draw from one default-constructed generator: labels first, then scores.
+  SimpleLCG lcg;
+  SimpleRealUniformDistribution<float> label_dist{0.0f, static_cast<float>(n_cls)};
+  for (size_t row = 0; row < n_rows; ++row) {
+    truth[row] = label_dist(&lcg);
+  }
+
+  SimpleRealUniformDistribution<float> score_dist{0.0f, 1.0f};
+  for (size_t idx = 0; idx < n_rows * n_cls; ++idx) {
+    scores[idx] = score_dist(&lcg);
+  }
+
+  // The first evaluation is the reference for eight further ones.
+  auto const first = metric->Evaluate(predts, p_fmat);
+  size_t remaining = 8;
+  while (remaining-- > 0) {
+    ASSERT_EQ(metric->Evaluate(predts, p_fmat), first);
+  }
+}
+
+// Checks "merror" on `device` for the given split mode: a 1-prediction/2-label input must
+// throw; fixed inputs must match references. unique_ptr keeps a fatal ASSERT_* from leaking.
+inline void TestMultiClassError(int device, DataSplitMode data_split_mode) {
+  auto ctx = xgboost::CreateEmptyGenericParam(device);
+  ctx.gpu_id = device;
+  std::unique_ptr<Metric> metric{Metric::Create("merror", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "merror");
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0}, {0, 0}, {}, {}, data_split_mode));
+  EXPECT_NEAR(GetMetricEval(
+      metric.get(), {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
+                            {0, 1, 2}, {}, {}, data_split_mode),
+              0.666f, 0.001f);
+}
+
+// Fixed-input "merror" checks under `data_split_mode`, then the repeated-evaluation check.
+inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  TestMultiClassError(GPUIDX, data_split_mode);
+  CheckDeterministicMetricMultiClass(StringView{"merror"}, GPUIDX);
+}
+// Checks "mlogloss" on `device` for the given split mode: a 1-prediction/2-label input must
+// throw; fixed inputs must match references. unique_ptr keeps a fatal ASSERT_* from leaking.
+inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) {
+  auto ctx = xgboost::CreateEmptyGenericParam(device);
+  ctx.gpu_id = device;
+  std::unique_ptr<Metric> metric{Metric::Create("mlogloss", &ctx)};
+  metric->Configure({});
+  ASSERT_STREQ(metric->Name(), "mlogloss");
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0}, {0, 0}, {}, {}, data_split_mode));
+  EXPECT_NEAR(GetMetricEval(
+    metric.get(), {1, 0, 0, 0, 1, 0, 0, 0, 1}, {0, 1, 2}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f},
+                            {0, 1, 2}, {}, {}, data_split_mode),
+              2.302f, 0.001f);
+}
+
+// Fixed-input "mlogloss" checks under `data_split_mode`, then the repeated-evaluation check.
+inline void VerifyMultiClassLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  TestMultiClassLogLoss(GPUIDX, data_split_mode);
+  CheckDeterministicMetricMultiClass(StringView{"mlogloss"}, GPUIDX);
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index 3e1028c48..430671305 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -11,16 +11,20 @@
 #include <memory>                        // for unique_ptr
 #include <vector>                        // for vector
 
+#include "test_rank_metric.h"
 #include "../helpers.h"                  // for GetMetricEval, CreateEmptyGe...
 #include "xgboost/base.h"                // for bst_float, kRtEps
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/json.h"                // for Json, String, Object
 
+namespace xgboost {
+namespace metric {
+
 #if !defined(__CUDACC__)
 TEST(Metric, AMS) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &ctx));
-  xgboost::Metric* metric = xgboost::Metric::Create("ams@0.5f", &ctx);
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  EXPECT_ANY_THROW(Metric::Create("ams", &ctx));
+  Metric* metric = Metric::Create("ams@0.5f", &ctx);
   ASSERT_STREQ(metric->Name(), "ams@0.5");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.311f, 0.001f);
   EXPECT_NEAR(GetMetricEval(metric,
@@ -29,7 +33,7 @@ TEST(Metric, AMS) {
               0.29710f, 0.001f);
 
   delete metric;
-  metric = xgboost::Metric::Create("ams@0", &ctx);
+  metric = Metric::Create("ams@0", &ctx);
   ASSERT_STREQ(metric->Name(), "ams@0");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.311f, 0.001f);
 
@@ -37,172 +41,44 @@ TEST(Metric, AMS) {
 }
 #endif
 
-TEST(Metric, DeclareUnifiedTest(Precision)) {
-  // When the limit for precision is not given, it takes the limit at
-  // std::numeric_limits<unsigned>::max(); hence all values are very small
-  // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("pre", &ctx);
-  ASSERT_STREQ(metric->Name(), "pre");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0, 1e-7);
+TEST(Metric, DeclareUnifiedTest(Precision)) { VerifyPrecision(); }
 
-  delete metric;
-  metric = xgboost::Metric::Create("pre@2", &ctx);
-  ASSERT_STREQ(metric->Name(), "pre@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0.5f, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
+TEST(Metric, DeclareUnifiedTest(NDCG)) { VerifyNDCG(); }
 
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
+TEST(Metric, DeclareUnifiedTest(MAP)) { VerifyMAP(); }
 
-  delete metric;
+TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) { VerifyNDCGExpGain(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kRow);
 }
 
-namespace xgboost {
-namespace metric {
-TEST(Metric, DeclareUnifiedTest(NDCG)) {
-  auto ctx = CreateEmptyGenericParam(GPUIDX);
-  Metric * metric = xgboost::Metric::Create("ndcg", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg");
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
-  ASSERT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 1, 1e-10);
-  ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.6509f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg@2", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.3868f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg@-", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-  ASSERT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.6509f, 0.001f);
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg-", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-               0.6509f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("ndcg@2-", &ctx);
-  ASSERT_STREQ(metric->Name(), "ndcg@2-");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.f, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              1.f - 0.3868f, 1.f - 0.001f);
-
-  delete metric;
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kCol);
 }
 
-TEST(Metric, DeclareUnifiedTest(MAP)) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  Metric * metric = xgboost::Metric::Create("map", &ctx);
-  ASSERT_STREQ(metric->Name(), "map");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, kRtEps);
-
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            std::vector<xgboost::bst_float>{}), 1, 1e-10);
-
-  // Rank metric with group info
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f},
-                            {1, 1, 1, 0, 1, 0},  // Labels
-                            {},  // Weights
-                            {0, 2, 5, 6}),  // Group info
-              0.8611f, 0.001f);
-
-  delete metric;
-  metric = xgboost::Metric::Create("map@-", &ctx);
-  ASSERT_STREQ(metric->Name(), "map-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-
-  delete metric;
-  metric = xgboost::Metric::Create("map-", &ctx);
-  ASSERT_STREQ(metric->Name(), "map-");
-  EXPECT_NEAR(GetMetricEval(metric,
-                            xgboost::HostDeviceVector<xgboost::bst_float>{},
-                            {}), 0, 1e-10);
-
-  delete metric;
-  metric = xgboost::Metric::Create("map@2", &ctx);
-  ASSERT_STREQ(metric->Name(), "map@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.25f, 0.001f);
-  delete metric;
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kRow);
 }
 
-TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) {
-  Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kCol);
+}
 
-  auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id};
-  info.num_row_ = info.labels.Shape(0);
-  info.group_ptr_.resize(2);
-  info.group_ptr_[0] = 0;
-  info.group_ptr_[1] = info.num_row_;
-  HostDeviceVector<float> predt{{0.1f, 0.2f, 0.3f, 4.0f, 70.0f}};
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kRow);
+}
 
-  std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)};
-  Json config{Object{}};
-  config["name"] = String{"ndcg"};
-  config["lambdarank_param"] = Object{};
-  config["lambdarank_param"]["ndcg_exp_gain"] = String{"true"};
-  config["lambdarank_param"]["lambdarank_num_pair_per_sample"] = String{"32"};
-  metric->LoadConfig(config);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kCol);
+}
 
-  auto ndcg = metric->Evaluate(predt, p_fmat);
-  ASSERT_NEAR(ndcg, 0.409738f, kRtEps);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kRow);
+}
 
-  config["lambdarank_param"]["ndcg_exp_gain"] = String{"false"};
-  metric->LoadConfig(config);
-
-  ndcg = metric->Evaluate(predt, p_fmat);
-  ASSERT_NEAR(ndcg, 0.695694f, kRtEps);
-
-  predt.HostVector() = info.labels.Data()->HostVector();
-  ndcg = metric->Evaluate(predt, p_fmat);
-  ASSERT_NEAR(ndcg, 1.0, kRtEps);
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h
new file mode 100644
index 000000000..318de961b
--- /dev/null
+++ b/tests/cpp/metric/test_rank_metric.h
@@ -0,0 +1,191 @@
+/**
+ * Copyright 2016-2023 by XGBoost Contributors
+ */
+#pragma once
+#include <gtest/gtest.h>                 // for Test, EXPECT_NEAR, ASSERT_STREQ
+#include <xgboost/context.h>             // for Context
+#include <xgboost/data.h>                // for MetaInfo, DMatrix
+#include <xgboost/linalg.h>              // for Matrix
+#include <xgboost/metric.h>              // for Metric
+
+#include <algorithm>                     // for max
+#include <memory>                        // for unique_ptr
+#include <vector>                        // for vector
+
+#include "../helpers.h"                  // for GetMetricEval, CreateEmptyGe...
+#include "xgboost/base.h"                // for bst_float, kRtEps
+#include "xgboost/host_device_vector.h"  // for HostDeviceVector
+#include "xgboost/json.h"                // for Json, String, Object
+
+namespace xgboost {
+namespace metric {
+
+// Checks "pre" and "pre@2" against fixed reference values for the given split mode. The
+// metric is owned by a unique_ptr so that a fatal ASSERT_* cannot leak it.
+inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  // When the limit for precision is not given, it takes the limit at
+  // std::numeric_limits<unsigned>::max(); hence all values are very small
+  // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("pre", &ctx)};
+  ASSERT_STREQ(metric->Name(), "pre");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-7);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0, 1e-7);
+
+  metric.reset(Metric::Create("pre@2", &ctx));
+  ASSERT_STREQ(metric->Name(), "pre@2");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+
+  // Predictions passed together with an empty label vector must throw.
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));
+}
+
+// Checks "ndcg" and its "@k" / "-" variants against fixed reference values for the given
+// split mode. The metric is owned by a unique_ptr so that a fatal ASSERT_* cannot leak it.
+inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)};
+  ASSERT_STREQ(metric->Name(), "ndcg");
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));
+  ASSERT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 1, 1e-10);
+  ASSERT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6509f, 0.001f);
+
+  metric.reset(Metric::Create("ndcg@2", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg@2");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.3868f, 0.001f);
+
+  // With the "-" suffix an empty input is expected to score 0 rather than 1.
+  metric.reset(Metric::Create("ndcg@-", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+  ASSERT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6509f, 0.001f);
+
+  metric.reset(Metric::Create("ndcg-", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.6509f, 0.001f);
+
+  metric.reset(Metric::Create("ndcg@2-", &ctx));
+  ASSERT_STREQ(metric->Name(), "ndcg@2-");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1.f, 1e-10);
+  // NOTE(review): the tolerance below is 1 - 0.001 = 0.999, so this check accepts almost any
+  // value; the intended expectation and tolerance should be confirmed before tightening it.
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              1.f - 0.3868f, 1.f - 0.001f);
+}
+
+// Checks "map" and its "@k" / "-" variants against fixed reference values for the given
+// split mode. The metric is owned by a unique_ptr so that a fatal ASSERT_* cannot leak it.
+inline void VerifyMAP(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("map", &ctx)};
+  ASSERT_STREQ(metric->Name(), "map");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, kRtEps);
+
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.5f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            std::vector<xgboost::bst_float>{}, {}, {}, data_split_mode), 1, 1e-10);
+
+  // Rank metric with group info
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.2f, 0.8f, 0.4f, 1.7f},
+                            {1, 1, 1, 0, 1, 0},  // Labels
+                            {},  // Weights
+                            {0, 2, 5, 6},  // Group info
+                            data_split_mode),
+              0.8611f, 0.001f);
+
+  // "map@-" is expected to be reported as "map-"; with the "-" suffix an empty input is
+  // expected to score 0 instead of 1.
+  metric.reset(Metric::Create("map@-", &ctx));
+  ASSERT_STREQ(metric->Name(), "map-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+
+  metric.reset(Metric::Create("map-", &ctx));
+  ASSERT_STREQ(metric->Name(), "map-");
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            xgboost::HostDeviceVector<xgboost::bst_float>{},
+                            {}, {}, {}, data_split_mode), 0, 1e-10);
+
+  metric.reset(Metric::Create("map@2", &ctx));
+  ASSERT_STREQ(metric->Name(), "map@2");
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric.get(),
+                            {0.1f, 0.9f, 0.1f, 0.9f},
+                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
+              0.25f, 0.001f);
+}
+
+// Checks "ndcg" on a single five-row query group with `ndcg_exp_gain` switched on and off,
+// and that predictions equal to the labels score 1, for the given split mode.
+inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  // One query group covering all five rows.
+  auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.data_split_mode = data_split_mode;
+  info.labels = linalg::Matrix<float>{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id};
+  info.num_row_ = info.labels.Shape(0);
+  info.group_ptr_.resize(2);
+  info.group_ptr_.front() = 0;
+  info.group_ptr_.back() = info.num_row_;
+  HostDeviceVector<float> predt{{0.1f, 0.2f, 0.3f, 4.0f, 70.0f}};
+
+  std::unique_ptr<Metric> metric{Metric::Create("ndcg", &ctx)};
+  Json config{Object{}};
+  config["name"] = String{"ndcg"};
+  config["lambdarank_param"] = Object{};
+  config["lambdarank_param"]["lambdarank_num_pair_per_sample"] = String{"32"};
+
+  // Reloads the configuration with the requested gain mode and re-evaluates.
+  auto eval_with_gain = [&](char const *exp_gain) {
+    config["lambdarank_param"]["ndcg_exp_gain"] = String{exp_gain};
+    metric->LoadConfig(config);
+    return metric->Evaluate(predt, p_fmat);
+  };
+  ASSERT_NEAR(eval_with_gain("true"), 0.409738f, kRtEps);
+  ASSERT_NEAR(eval_with_gain("false"), 0.695694f, kRtEps);
+
+  // Predictions identical to the labels are expected to score 1.
+  predt.HostVector() = info.labels.Data()->HostVector();
+  ASSERT_NEAR(metric->Evaluate(predt, p_fmat), 1.0, kRtEps);
+}
+}  // namespace metric
+}  // namespace xgboost
diff --git a/tests/cpp/metric/test_survival_metric.cu b/tests/cpp/metric/test_survival_metric.cu
index 80d6b72e6..d7ac54860 100644
--- a/tests/cpp/metric/test_survival_metric.cu
+++ b/tests/cpp/metric/test_survival_metric.cu
@@ -46,9 +46,8 @@ inline void CheckDeterministicMetricElementWise(StringView name, int32_t device)
     ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
   }
 }
-}  // anonymous namespace
 
-TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) {
+void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) {
   auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
 
   /**
@@ -59,10 +58,11 @@ TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) {
   MetaInfo& info = p_fmat->Info();
   info.num_row_ = 4;
   info.labels_lower_bound_.HostVector()
-    = { 100.0f, 0.0f, 60.0f, 16.0f };
+      = { 100.0f, 0.0f, 60.0f, 16.0f };
   info.labels_upper_bound_.HostVector()
-    = { 100.0f, 20.0f, std::numeric_limits<bst_float>::infinity(), 200.0f };
+      = { 100.0f, 20.0f, std::numeric_limits<bst_float>::infinity(), 200.0f };
   info.weights_.HostVector() = std::vector<bst_float>();
+  info.data_split_mode = data_split_mode;
   HostDeviceVector<bst_float> preds(4, std::log(64));
 
   struct TestCase {
@@ -70,15 +70,15 @@ TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) {
     bst_float reference_value;
   };
   for (const auto& test_case : std::vector<TestCase>{ {"normal", 2.1508f}, {"logistic", 2.1804f},
-                                                      {"extreme", 2.0706f} }) {
+                                                     {"extreme", 2.0706f} }) {
     std::unique_ptr<Metric> metric(Metric::Create("aft-nloglik", &ctx));
     metric->Configure({ {"aft_loss_distribution", test_case.dist_type},
-                        {"aft_loss_distribution_scale", "1.0"} });
+                       {"aft_loss_distribution_scale", "1.0"} });
     EXPECT_NEAR(metric->Evaluate(preds, p_fmat), test_case.reference_value, 1e-4);
   }
 }
 
-TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) {
+void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = DataSplitMode::kRow) {
   auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
 
   auto p_fmat = EmptyDMatrix();
@@ -87,6 +87,7 @@ TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) {
   info.labels_lower_bound_.HostVector() = { 20.0f, 0.0f, 60.0f, 16.0f };
   info.labels_upper_bound_.HostVector() = { 80.0f, 20.0f, 80.0f, 200.0f };
   info.weights_.HostVector() = std::vector<bst_float>();
+  info.data_split_mode = data_split_mode;
   HostDeviceVector<bst_float> preds(4, std::log(60.0f));
 
   std::unique_ptr<Metric> metric(Metric::Create("interval-regression-accuracy", &ctx));
@@ -102,6 +103,27 @@ TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) {
 
   CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX);
 }
+}  // anonymous namespace
+
+TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) { VerifyAFTNegLogLik(); }  // default: kRow
+// The *Split cases hand the same verifier to RunWithInMemoryCommunicator with world_size_.
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kCol);
+}
+// Same three variants (plain, row split, column split) for VerifyIntervalRegressionAccuracy.
+TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) { VerifyIntervalRegressionAccuracy(); }
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyRowSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kRow);
+}
+
+TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kCol);
+}
 
 // Test configuration of AFT metric
 TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) {

From ef13dd31b1dc073f4732544739db5a252ae5b6e5 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Tue, 18 Apr 2023 21:16:06 +0800
Subject: [PATCH 02/34] Rework the NDCG objective. (#9015)

---
 R-package/src/Makevars.in                   |   1 +
 R-package/src/Makevars.win                  |   1 +
 src/common/math.h                           |  31 +-
 src/common/ranking_utils.h                  |   6 +-
 src/objective/lambdarank_obj.cc             | 440 ++++++++++++++++++++
 src/objective/lambdarank_obj.cu             | 417 +++++++++++++++++++
 src/objective/lambdarank_obj.h              |  84 ++--
 src/objective/objective.cc                  |   3 +
 src/objective/rank_obj.cu                   | 172 --------
 tests/cpp/objective/test_lambdarank_obj.cc  | 122 +++++-
 tests/cpp/objective/test_lambdarank_obj.cu  |  18 +
 tests/cpp/objective/test_lambdarank_obj.h   |  21 +-
 tests/cpp/objective/test_ranking_obj.cc     |  45 --
 tests/cpp/objective/test_ranking_obj_gpu.cu |  56 ---
 tests/python-gpu/test_gpu_eval_metrics.py   |  16 +-
 15 files changed, 1082 insertions(+), 351 deletions(-)
 create mode 100644 src/objective/lambdarank_obj.cc

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 743bf0a66..04f0a74a5 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -33,6 +33,7 @@ OBJECTS= \
     $(PKGROOT)/src/objective/regression_obj.o \
     $(PKGROOT)/src/objective/multiclass_obj.o \
     $(PKGROOT)/src/objective/rank_obj.o \
+    $(PKGROOT)/src/objective/lambdarank_obj.o \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
     $(PKGROOT)/src/objective/adaptive.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index a32d2fd2e..969cb7ff4 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -33,6 +33,7 @@ OBJECTS= \
     $(PKGROOT)/src/objective/regression_obj.o \
     $(PKGROOT)/src/objective/multiclass_obj.o \
     $(PKGROOT)/src/objective/rank_obj.o \
+    $(PKGROOT)/src/objective/lambdarank_obj.o \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
     $(PKGROOT)/src/objective/adaptive.o \
diff --git a/src/common/math.h b/src/common/math.h
index 71a494544..c4d794b5d 100644
--- a/src/common/math.h
+++ b/src/common/math.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2015 by Contributors
+/**
+ * Copyright 2015-2023 by XGBoost Contributors
  * \file math.h
  * \brief additional math utils
  * \author Tianqi Chen
@@ -7,16 +7,19 @@
 #ifndef XGBOOST_COMMON_MATH_H_
 #define XGBOOST_COMMON_MATH_H_
 
-#include <xgboost/base.h>
+#include <xgboost/base.h>  // for XGBOOST_DEVICE
 
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <utility>
-#include <vector>
+#include <algorithm>    // for max
+#include <cmath>        // for exp, abs, log, lgamma
+#include <limits>       // for numeric_limits
+#include <type_traits>  // for is_floating_point, conditional, is_signed, is_same, declval, enable_if
+#include <utility>      // for pair
 
 namespace xgboost {
 namespace common {
+
+template <typename T> XGBOOST_DEVICE T Sqr(T const &w) { return w * w; }  // square of w
+
 /*!
  * \brief calculate the sigmoid of the input.
  * \param x input parameter
@@ -30,9 +33,11 @@ XGBOOST_DEVICE inline float Sigmoid(float x) {
   return y;
 }
 
-template <typename T>
-XGBOOST_DEVICE inline static T Sqr(T a) { return a * a; }
-
+XGBOOST_DEVICE inline double Sigmoid(double x) {  // double overload of the float Sigmoid above
+  auto denom = std::exp(-x) + 1.0;
+  auto y = 1.0 / denom;  // 1 / (1 + exp(-x))
+  return y;
+}
 /*!
  * \brief Equality test for both integer and floating point.
  */
@@ -134,10 +139,6 @@ inline static bool CmpFirst(const std::pair<float, unsigned> &a,
                             const std::pair<float, unsigned> &b) {
   return a.first > b.first;
 }
-inline static bool CmpSecond(const std::pair<float, unsigned> &a,
-                             const std::pair<float, unsigned> &b) {
-  return a.second > b.second;
-}
 
 // Redefined here to workaround a VC bug that doesn't support overloading for integer
 // types.
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index bc071c2d6..dd823a0d6 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -70,7 +70,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
   // pairs
   // should be accessed by getter for auto configuration.
   // nolint so that we can keep the string name.
-  PairMethod lambdarank_pair_method{PairMethod::kMean};  // NOLINT
+  PairMethod lambdarank_pair_method{PairMethod::kTopK};  // NOLINT
   std::size_t lambdarank_num_pair_per_sample{NotSet()};  // NOLINT
 
  public:
@@ -78,7 +78,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
 
   // unbiased
   bool lambdarank_unbiased{false};
-  double lambdarank_bias_norm{2.0};
+  double lambdarank_bias_norm{1.0};
   // ndcg
   bool ndcg_exp_gain{true};
 
@@ -135,7 +135,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
         .set_default(false)
         .describe("Unbiased lambda mart. Use extended IPW to debias click position");
     DMLC_DECLARE_FIELD(lambdarank_bias_norm)
-        .set_default(2.0)
+        .set_default(1.0)
         .set_lower_bound(0.0)
         .describe("Lp regularization for unbiased lambdarank.");
     DMLC_DECLARE_FIELD(ndcg_exp_gain)
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
new file mode 100644
index 000000000..30957f81a
--- /dev/null
+++ b/src/objective/lambdarank_obj.cc
@@ -0,0 +1,440 @@
+/**
+ * Copyright (c) 2023, XGBoost contributors
+ */
+#include "lambdarank_obj.h"
+
+#include <dmlc/registry.h>                 // for DMLC_REGISTRY_FILE_TAG
+
+#include <algorithm>                       // for transform, copy, fill_n, min, max
+#include <cmath>                           // for pow, log2
+#include <cstddef>                         // for size_t
+#include <cstdint>                         // for int32_t
+#include <map>                             // for operator!=
+#include <memory>                          // for shared_ptr, __shared_ptr_access, allocator
+#include <ostream>                         // for operator<<, basic_ostream
+#include <string>                          // for char_traits, operator<, basic_string, string
+#include <tuple>                           // for apply, make_tuple
+#include <type_traits>                     // for is_floating_point
+#include <utility>                         // for pair, swap
+#include <vector>                          // for vector
+
+#include "../common/error_msg.h"           // for GroupWeight, LabelScoreSize
+#include "../common/linalg_op.h"           // for begin, cbegin, cend
+#include "../common/optional_weight.h"     // for MakeOptionalWeights, OptionalWeights
+#include "../common/ranking_utils.h"       // for RankingCache, LambdaRankParam, MAPCache, NDCGC...
+#include "../common/threading_utils.h"     // for ParallelFor, Sched
+#include "../common/transform_iterator.h"  // for IndexTransformIter
+#include "init_estimation.h"               // for FitIntercept
+#include "xgboost/base.h"                  // for bst_group_t, GradientPair, kRtEps, GradientPai...
+#include "xgboost/context.h"               // for Context
+#include "xgboost/data.h"                  // for MetaInfo
+#include "xgboost/host_device_vector.h"    // for HostDeviceVector
+#include "xgboost/json.h"                  // for Json, get, Value, ToJson, F32Array, FromJson, IsA
+#include "xgboost/linalg.h"                // for Vector, Range, TensorView, VectorView, All
+#include "xgboost/logging.h"               // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_LE, CHE...
+#include "xgboost/objective.h"             // for ObjFunctionReg, XGBOOST_REGISTER_OBJECTIVE
+#include "xgboost/span.h"                  // for Span, operator!=
+#include "xgboost/string_view.h"           // for operator<<, StringView
+#include "xgboost/task.h"                  // for ObjInfo
+
+namespace xgboost::obj {
+namespace cpu_impl {
+void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
+                                  linalg::VectorView<double const> lj_full,
+                                  linalg::Vector<double>* p_ti_plus,
+                                  linalg::Vector<double>* p_tj_minus, linalg::Vector<double>* p_li,
+                                  linalg::Vector<double>* p_lj,
+                                  std::shared_ptr<ltr::RankingCache> p_cache) {
+  auto ti_plus = p_ti_plus->HostView();
+  auto tj_minus = p_tj_minus->HostView();
+  auto li = p_li->HostView();
+  auto lj = p_lj->HostView();
+  // Host version: sum the per-sample li/lj by in-group position, then recompute ti+ and tj-.
+  auto gptr = p_cache->DataGroupPtr(ctx);
+  auto n_groups = p_cache->Groups();
+  auto regularizer = p_cache->Param().Regularizer();
+
+  // Aggregate over query groups: entry i of every group is added to li(i) / lj(i).
+  for (bst_group_t g{0}; g < n_groups; ++g) {
+    auto begin = gptr[g];
+    auto end = gptr[g + 1];
+    std::size_t group_size = end - begin;
+    auto n = std::min(group_size, p_cache->MaxPositionSize());  // entries past n are skipped
+
+    auto g_li = li_full.Slice(linalg::Range(begin, end));
+    auto g_lj = lj_full.Slice(linalg::Range(begin, end));
+
+    for (std::size_t i{0}; i < n; ++i) {
+      li(i) += g_li(i);
+      lj(i) += g_lj(i);
+    }
+  }
+  // The ti+ is not guaranteed to decrease since it depends on the |\delta Z|.
+  //
+  // The update normalizes the ti+ to make ti+(0) equal to 1, which breaks its meaning as a
+  // probability. The reasoning behind the normalization is not clear; here we are just
+  // following the authors.
+  for (std::size_t i = 0; i < ti_plus.Size(); ++i) {
+    if (li(0) >= Eps64()) {  // keep the old ti+ when the divisor li(0) is below Eps64()
+      ti_plus(i) = std::pow(li(i) / li(0), regularizer);  // eq.30
+    }
+    if (lj(0) >= Eps64()) {  // keep the old tj- when the divisor lj(0) is below Eps64()
+      tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);  // eq.31
+    }
+    assert(!std::isinf(ti_plus(i)));  // NOTE(review): <cassert> is not included directly
+    assert(!std::isinf(tj_minus(i)));
+  }
+}
+}  // namespace cpu_impl
+
+/**
+ * \brief Base class for pair-wise learning to rank.
+ *
+ *   See `From RankNet to LambdaRank to LambdaMART: An Overview` for a description of the
+ *   algorithm. In addition to ranking, this also implements `Unbiased LambdaMART: An
+ *   Unbiased Pairwise Learning-to-Rank Algorithm`.
+ * \tparam Loss  Derived objective; supplies `Name()` and `GetGradientImpl()`.
+ * \tparam Cache Cache type built from (ctx, info, param) whenever GetGradient renews it.
+ */
+template <typename Loss, typename Cache>
+class LambdaRankObj : public FitIntercept {
+  MetaInfo const* p_info_{nullptr};  // MetaInfo the current cache was created for
+
+  // Update the position bias (ti+, tj-) for unbiased click data, then zero the li/lj buffers.
+  void UpdatePositionBias() {
+    li_full_.SetDevice(ctx_->gpu_id);
+    lj_full_.SetDevice(ctx_->gpu_id);
+    li_.SetDevice(ctx_->gpu_id);
+    lj_.SetDevice(ctx_->gpu_id);
+
+    if (ctx_->IsCPU()) {
+      cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
+                                             lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
+                                             &li_, &lj_, p_cache_);
+    } else {
+      cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id),
+                                              lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_,
+                                              &li_, &lj_, p_cache_);
+    }
+    // Reset all four accumulators to zero once the bias has been updated.
+    li_full_.Data()->Fill(0.0);
+    lj_full_.Data()->Fill(0.0);
+
+    li_.Data()->Fill(0.0);
+    lj_.Data()->Fill(0.0);
+  }
+
+ protected:
+  // L / tj-* (eq. 30)
+  linalg::Vector<double> li_;
+  // L / ti+* (eq. 31)
+  linalg::Vector<double> lj_;
+  // position bias ratio for relevant doc, ti+ (eq. 30)
+  linalg::Vector<double> ti_plus_;
+  // position bias ratio for irrelevant doc, tj- (eq. 31)
+  linalg::Vector<double> tj_minus_;
+  // li buffer for all samples
+  linalg::Vector<double> li_full_;
+  // lj buffer for all samples
+  linalg::Vector<double> lj_full_;
+
+  ltr::LambdaRankParam param_;
+  // cache, rebuilt by GetGradient when the MetaInfo or the parameter changes
+  std::shared_ptr<ltr::RankingCache> p_cache_;
+
+  [[nodiscard]] std::shared_ptr<Cache> GetCache() const {  // p_cache_ as the concrete Cache
+    auto ptr = std::static_pointer_cast<Cache>(p_cache_);
+    CHECK(ptr);
+    return ptr;
+  }
+
+  // Get the li/lj view for group `g`: its own slice when unbiased, otherwise the full vector.
+  linalg::VectorView<double> GroupLoss(bst_group_t g, linalg::Vector<double>* v) const {
+    auto gptr = p_cache_->DataGroupPtr(ctx_);
+    auto begin = gptr[g];
+    auto end = gptr[g + 1];
+    if (param_.lambdarank_unbiased) {
+      return v->HostView().Slice(linalg::Range(begin, end));
+    }
+    return v->HostView();
+  }
+
+  // Calculate lambda gradient for one group on CPU. `delta` is invoked with `g` appended.
+  template <bool unbiased, typename Delta>
+  void CalcLambdaForGroup(std::int32_t iter, common::Span<float const> g_predt,
+                          linalg::VectorView<float const> g_label, float w,
+                          common::Span<std::size_t const> g_rank, bst_group_t g, Delta delta,
+                          common::Span<GradientPair> g_gpair) {
+    std::fill_n(g_gpair.data(), g_gpair.size(), GradientPair{});
+    auto p_gpair = g_gpair.data();
+
+    auto ti_plus = ti_plus_.HostView();
+    auto tj_minus = tj_minus_.HostView();
+
+    auto li = GroupLoss(g, &li_full_);
+    auto lj = GroupLoss(g, &lj_full_);
+
+    // Normalization, first used by LightGBM.
+    // https://github.com/microsoft/LightGBM/pull/2331#issuecomment-523259298
+    double sum_lambda{0.0};
+
+    auto delta_op = [&](auto const&... args) { return delta(args..., g); };
+    // Callback handed to MakePairs; i and j are positions in the ranked list `g_rank`.
+    auto loop = [&](std::size_t i, std::size_t j) {
+      // higher/lower on the target ranked list
+      std::size_t rank_high = i, rank_low = j;
+      if (g_label(g_rank[rank_high]) == g_label(g_rank[rank_low])) {
+        return;  // equal labels: the pair contributes nothing
+      }
+      if (g_label(g_rank[rank_high]) < g_label(g_rank[rank_low])) {
+        std::swap(rank_high, rank_low);  // make rank_high refer to the larger label
+      }
+
+      double cost;
+      auto pg = LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
+                                     ti_plus, tj_minus, &cost);
+      auto ng = Repulse(pg);
+      // pg goes to the document with the larger label, Repulse(pg) to the other one.
+      std::size_t idx_high = g_rank[rank_high];
+      std::size_t idx_low = g_rank[rank_low];
+      p_gpair[idx_high] += pg;
+      p_gpair[idx_low] += ng;
+
+      if (unbiased) {  // accumulate the eq.30 / eq.31 terms used by the position bias update
+        auto k = ti_plus.Size();
+        // We can probably use all the positions. If we skip the update due to having
+        // high/low > k, we might be losing out too many pairs. On the other hand, if we
+        // cap the position, then we might be accumulating too many tail bias into the
+        // last tracked position.
+        // We use `idx_high` since it represents the original position from the label
+        // list, and label list is assumed to be sorted.
+        if (idx_high < k && idx_low < k) {
+          if (tj_minus(idx_low) >= Eps64()) {
+            li(idx_high) += cost / tj_minus(idx_low);  // eq.30
+          }
+          if (ti_plus(idx_high) >= Eps64()) {
+            lj(idx_low) += cost / ti_plus(idx_high);  // eq.31
+          }
+        }
+      }
+
+      sum_lambda += -2.0 * static_cast<double>(pg.GetGrad());
+    };
+
+    MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
+    if (sum_lambda > 0.0) {  // rescale the group's gradients by log2(1 + sum) / sum
+      double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+      std::transform(g_gpair.data(), g_gpair.data() + g_gpair.size(), g_gpair.data(),
+                     [norm](GradientPair const& g) { return g * norm; });
+    }
+    // Finally scale by the group weight `w` and the cache's weight normalization.
+    auto w_norm = p_cache_->WeightNorm();
+    std::transform(g_gpair.begin(), g_gpair.end(), g_gpair.begin(),
+                   [&](GradientPair const& gpair) { return gpair * w * w_norm; });
+  }
+
+ public:
+  void Configure(Args const& args) override { param_.UpdateAllowUnknown(args); }
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String(Loss::Name());
+    out["lambdarank_param"] = ToJson(param_);
+    // Copies a bias vector into `out`, which the caller has already set to an F32Array.
+    auto save_bias = [](linalg::Vector<double> const& in, Json out) {
+      auto& out_array = get<F32Array>(out);
+      out_array.resize(in.Size());
+      auto h_in = in.HostView();
+      std::copy(linalg::cbegin(h_in), linalg::cend(h_in), out_array.begin());
+    };
+
+    if (param_.lambdarank_unbiased) {  // bias ratios are only written for unbiased LTR
+      out["ti+"] = F32Array();
+      save_bias(ti_plus_, out["ti+"]);
+      out["tj-"] = F32Array();
+      save_bias(tj_minus_, out["tj-"]);
+    }
+  }
+  void LoadConfig(Json const& in) override {
+    auto const& obj = get<Object const>(in);
+    if (obj.find("lambdarank_param") != obj.cend()) {  // the parameter entry is optional
+      FromJson(in["lambdarank_param"], &param_);
+    }
+
+    if (param_.lambdarank_unbiased) {  // expects the "ti+" and "tj-" entries to be present
+      auto load_bias = [](Json in, linalg::Vector<double>* out) {
+        if (IsA<F32Array>(in)) {
+          // JSON (NOTE(review): verify which format yields the typed F32Array)
+          auto const& array = get<F32Array>(in);
+          out->Reshape(array.size());
+          auto h_out = out->HostView();
+          std::copy(array.cbegin(), array.cend(), linalg::begin(h_out));
+        } else {
+          // UBJSON (NOTE(review): see above; this branch reads a generic Array of Number)
+          auto const& array = get<Array>(in);
+          out->Reshape(array.size());
+          auto h_out = out->HostView();
+          std::transform(array.cbegin(), array.cend(), linalg::begin(h_out),
+                         [](Json const& v) { return get<Number const>(v); });
+        }
+      };
+      load_bias(in["ti+"], &ti_plus_);
+      load_bias(in["tj-"], &tj_minus_);
+    }
+  }
+
+  [[nodiscard]] ObjInfo Task() const override { return ObjInfo{ObjInfo::kRanking}; }
+
+  [[nodiscard]] bst_target_t Targets(MetaInfo const& info) const override {
+    CHECK_LE(info.labels.Shape(1), 1) << "multi-output for LTR is not yet supported.";
+    return 1;
+  }
+
+  [[nodiscard]] const char* RankEvalMetric(StringView metric) const {
+    static thread_local std::string name;  // keeps the returned c_str() alive after return
+    if (param_.HasTruncation()) {
+      name = ltr::MakeMetricName(metric, param_.NumPair(), false);
+    } else {
+      name = ltr::MakeMetricName(metric, param_.NotSet(), false);
+    }
+    return name.c_str();
+  }
+
+  void GetGradient(HostDeviceVector<float> const& predt, MetaInfo const& info, std::int32_t iter,
+                   HostDeviceVector<GradientPair>* out_gpair) override {
+    CHECK_EQ(info.labels.Size(), predt.Size()) << error::LabelScoreSize();
+
+    // init/renew cache when it is missing, or the MetaInfo or the parameter has changed
+    if (!p_cache_ || p_info_ != &info || p_cache_->Param() != param_) {
+      p_cache_ = std::make_shared<Cache>(ctx_, info, param_);
+      p_info_ = &info;
+    }
+    auto n_groups = p_cache_->Groups();
+    if (!info.weights_.Empty()) {  // weights, when given, must be one per query group
+      CHECK_EQ(info.weights_.Size(), n_groups) << error::GroupWeight();
+    }
+    // Bias state not allocated yet: ratios start at 1.0, loss accumulators at 0.0.
+    if (ti_plus_.Size() == 0 && param_.lambdarank_unbiased) {
+      CHECK_EQ(iter, 0);
+      ti_plus_ = linalg::Constant<double>(ctx_, 1.0, p_cache_->MaxPositionSize());
+      tj_minus_ = linalg::Constant<double>(ctx_, 1.0, p_cache_->MaxPositionSize());
+
+      li_ = linalg::Zeros<double>(ctx_, p_cache_->MaxPositionSize());
+      lj_ = linalg::Zeros<double>(ctx_, p_cache_->MaxPositionSize());
+
+      li_full_ = linalg::Zeros<double>(ctx_, info.num_row_);
+      lj_full_ = linalg::Zeros<double>(ctx_, info.num_row_);
+    }
+    static_cast<Loss*>(this)->GetGradientImpl(iter, predt, info, out_gpair);
+
+    if (param_.lambdarank_unbiased) {
+      this->UpdatePositionBias();
+    }
+  }
+};
+
+class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {  // NDCG loss
+ public:
+  template <bool unbiased, bool exp_gain>
+  void CalcLambdaForGroupNDCG(std::int32_t iter, common::Span<float const> g_predt,
+                              linalg::VectorView<float const> g_label, float w,
+                              common::Span<std::size_t const> g_rank,
+                              common::Span<GradientPair> g_gpair,
+                              linalg::VectorView<double const> inv_IDCG,
+                              common::Span<double const> discount, bst_group_t g) {
+    auto delta = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low,
+                     bst_group_t g) {  // pair delta: DeltaNDCG with this group's inverse IDCG
+      static_assert(std::is_floating_point<decltype(y_high)>::value);
+      return DeltaNDCG<exp_gain>(y_high, y_low, rank_high, rank_low, inv_IDCG(g), discount);
+    };
+    this->CalcLambdaForGroup<unbiased>(iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);
+  }
+  // Invoked through LambdaRankObj::GetGradient; fills `out_gpair` for all query groups.
+  void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
+                       const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
+    if (ctx_->IsCUDA()) {  // GPU: hand everything to the CUDA implementation and return
+      cuda_impl::LambdaRankGetGradientNDCG(
+          ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
+          tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
+          out_gpair);
+      return;
+    }
+
+    bst_group_t n_groups = p_cache_->Groups();
+    auto gptr = p_cache_->DataGroupPtr(ctx_);
+
+    out_gpair->Resize(info.num_row_);
+    auto h_gpair = out_gpair->HostSpan();
+    auto h_predt = predt.ConstHostSpan();
+    auto h_label = info.labels.HostView();
+    auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
+    auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
+
+    auto dct = GetCache()->Discount(ctx_);
+    auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
+    auto inv_IDCG = GetCache()->InvIDCG(ctx_);
+    // One task per query group; every per-group view below is cut out with gptr.
+    common::ParallelFor(n_groups, ctx_->Threads(), common::Sched::Guided(), [&](auto g) {
+      std::size_t cnt = gptr[g + 1] - gptr[g];
+      auto w = h_weight[g];  // weight looked up by group index
+      auto g_predt = h_predt.subspan(gptr[g], cnt);
+      auto g_gpair = h_gpair.subspan(gptr[g], cnt);
+      auto g_label = h_label.Slice(make_range(g), 0);
+      auto g_rank = rank_idx.subspan(gptr[g], cnt);
+
+      auto args =
+          std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g_gpair, inv_IDCG, dct, g);
+      // Map the two runtime flags onto the <unbiased, exp_gain> template parameters.
+      if (param_.lambdarank_unbiased) {
+        if (param_.ndcg_exp_gain) {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<true, true>, args);
+        } else {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<true, false>, args);
+        }
+      } else {
+        if (param_.ndcg_exp_gain) {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<false, true>, args);
+        } else {
+          std::apply(&LambdaRankNDCG::CalcLambdaForGroupNDCG<false, false>, args);
+        }
+      }
+    });
+  }
+
+  static char const* Name() { return "rank:ndcg"; }
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
+    return this->RankEvalMetric("ndcg");
+  }
+  [[nodiscard]] Json DefaultMetricConfig() const override {
+    Json config{Object{}};
+    config["name"] = String{DefaultEvalMetric()};
+    config["lambdarank_param"] = ToJson(param_);  // metric receives the objective's parameters
+    return config;
+  }
+};
+
+namespace cuda_impl {
+#if !defined(XGBOOST_USE_CUDA)  // stubs for builds where XGBOOST_USE_CUDA is not defined
+void LambdaRankGetGradientNDCG(Context const*, std::int32_t, HostDeviceVector<float> const&,
+                               const MetaInfo&, std::shared_ptr<ltr::NDCGCache>,
+                               linalg::VectorView<double const>,  // input bias ratio
+                               linalg::VectorView<double const>,  // input bias ratio
+                               linalg::VectorView<double>, linalg::VectorView<double>,
+                               HostDeviceVector<GradientPair>*) {
+  common::AssertGPUSupport();  // all arguments are ignored
+}
+// Same stub pattern for the position bias update.
+void LambdaRankUpdatePositionBias(Context const*, linalg::VectorView<double const>,
+                                  linalg::VectorView<double const>, linalg::Vector<double>*,
+                                  linalg::Vector<double>*, linalg::Vector<double>*,
+                                  linalg::Vector<double>*, std::shared_ptr<ltr::RankingCache>) {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace cuda_impl
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankNDCG, LambdaRankNDCG::Name())  // Name() is "rank:ndcg"
+    .describe("LambdaRank with NDCG loss as objective")
+    .set_body([]() { return new LambdaRankNDCG{}; });  // factory: a fresh objective per call
+
+DMLC_REGISTRY_FILE_TAG(lambdarank_obj);
+}  // namespace xgboost::obj
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index eb82b17b4..27b5872a8 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -37,6 +37,312 @@ namespace xgboost::obj {
 DMLC_REGISTRY_FILE_TAG(lambdarank_obj_cu);
 
 namespace cuda_impl {
+namespace {
+/**
+ * \brief Calculate min |t_plus| (d_min[0]) and min |tj_minus| (d_min[1]) for FP truncation.
+ */
+void MinBias(Context const* ctx, std::shared_ptr<ltr::RankingCache> p_cache,
+             linalg::VectorView<double const> t_plus, linalg::VectorView<double const> tj_minus,
+             common::Span<double> d_min) {
+  CHECK_EQ(d_min.size(), 2);
+  auto cuctx = ctx->CUDACtx();
+
+  auto k = t_plus.Size();
+  CHECK_GT(k, 0);
+  CHECK_EQ(k, p_cache->MaxPositionSize());
+
+  // Two segments over a virtual array of 2k values: [0, k) is t_plus, [k, 2k) is tj_minus.
+  auto key_it = dh::MakeTransformIterator<std::size_t>(
+      thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return i * k; });
+  auto val_it = dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul),
+                                                  [=] XGBOOST_DEVICE(std::size_t i) {
+                                                    if (i >= k) {
+                                                      return std::abs(tj_minus(i - k));
+                                                    }
+                                                    return std::abs(t_plus(i));
+                                                  });
+  std::size_t bytes{0};  // filled in by the size query (null temp storage) below
+  dh::safe_cuda(cub::DeviceSegmentedReduce::Min(nullptr, bytes, val_it, d_min.data(), 2, key_it,
+                                                key_it + 1, cuctx->Stream()));
+  dh::TemporaryArray<char> temp(bytes);
+  dh::safe_cuda(cub::DeviceSegmentedReduce::Min(temp.data().get(), bytes, val_it, d_min.data(), 2,
+                                                key_it, key_it + 1, cuctx->Stream()));
+}
+
+/**
+ * \brief Type for gradient statistic. (Gradient, cost for unbiased LTR, normalization factor)
+ */
+using GradCostNorm = thrust::tuple<GradientPair, double, double>;
+
+/**
+ * \brief Compute the gradient of one pair; when need_update is set, also accumulate it.
+ */
+template <bool unbiased, bool has_truncation, typename Delta>
+struct GetGradOp {
+  MakePairsOp<has_truncation> make_pair;  // maps (thread index, group) to a pair; holds args
+  Delta delta;                            // invoked as delta(..., g) by delta_op below
+
+  bool need_update;  // false: only return statistics; true: also write gradients and costs
+
+  auto __device__ operator()(std::size_t idx) -> GradCostNorm {
+    auto const& args = make_pair.args;
+    auto g = dh::SegmentId(args.d_threads_group_ptr, idx);  // used below as the group index
+
+    auto data_group_begin = static_cast<std::size_t>(args.d_group_ptr[g]);
+    std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin;
+    // obtain group segment data.
+    auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0);
+    auto g_predt = args.predts.subspan(data_group_begin, n_data);
+    auto g_gpair = args.gpairs.subspan(data_group_begin, n_data).data();
+    auto g_rank = args.d_sorted_idx.subspan(data_group_begin, n_data);
+
+    auto [i, j] = make_pair(idx, g);  // i and j index into g_rank
+
+    std::size_t rank_high = i, rank_low = j;
+    if (g_label(g_rank[i]) == g_label(g_rank[j])) {
+      return thrust::make_tuple(GradientPair{}, 0.0, 0.0);  // equal labels contribute nothing
+    }
+    if (g_label(g_rank[i]) < g_label(g_rank[j])) {
+      thrust::swap(rank_high, rank_low);  // rank_high must refer to the larger label
+    }
+
+    double cost{0};
+
+    auto delta_op = [&](auto const&... args) { return delta(args..., g); };
+    GradientPair pg = LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
+                                           args.ti_plus, args.tj_minus, &cost);
+
+    std::size_t idx_high = g_rank[rank_high];
+    std::size_t idx_low = g_rank[rank_low];
+
+    if (need_update) {
+      // second run: accumulate the truncated gradient into both samples of the pair
+
+      auto ng = Repulse(pg);  // NOTE(review): presumably the opposing gradient; confirm
+
+      auto gr = args.d_roundings(g);
+      // positive gradient truncated
+      auto pgt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), pg.GetGrad()),
+                              common::TruncateWithRounding(gr.GetHess(), pg.GetHess())};
+      // negative gradient truncated
+      auto ngt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), ng.GetGrad()),
+                              common::TruncateWithRounding(gr.GetHess(), ng.GetHess())};
+
+      dh::AtomicAddGpair(g_gpair + idx_high, pgt);
+      dh::AtomicAddGpair(g_gpair + idx_low, ngt);
+    }
+
+    if (unbiased && need_update) {
+      // second run: accumulate the truncated costs into li and lj for unbiased LTR
+      assert(args.tj_minus.Size() == args.ti_plus.Size() && "Invalid size of position bias");
+
+      auto g_li = args.li.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
+      auto g_lj = args.lj.Slice(linalg::Range(data_group_begin, data_group_begin + n_data));
+
+      if (idx_high < args.ti_plus.Size() && idx_low < args.ti_plus.Size()) {
+        if (args.tj_minus(idx_low) >= Eps64()) {
+          // eq.30
+          atomicAdd(&g_li(idx_high), common::TruncateWithRounding(args.d_cost_rounding[0],
+                                                                  cost / args.tj_minus(idx_low)));
+        }
+        if (args.ti_plus(idx_high) >= Eps64()) {
+          // eq.31
+          atomicAdd(&g_lj(idx_low), common::TruncateWithRounding(args.d_cost_rounding[0],
+                                                                 cost / args.ti_plus(idx_high)));
+        }
+      }
+    }
+    return thrust::make_tuple(GradientPair{std::abs(pg.GetGrad()), std::abs(pg.GetHess())},
+                              std::abs(cost), -2.0 * static_cast<double>(pg.GetGrad()));
+  }
+};
+
+template <bool unbiased, bool has_truncation, typename Delta>
+struct MakeGetGrad {  // builds GetGradOp functors that share one pair generator and delta
+  MakePairsOp<has_truncation> make_pair;
+  Delta delta;
+
+  MakeGetGrad(KernelInputs args, Delta d) : make_pair{args}, delta{std::move(d)} {}
+  [[nodiscard]] KernelInputs const& Args() const { return make_pair.args; }
+
+  // need_update is forwarded unchanged to the functor.
+  auto operator()(bool need_update) -> GetGradOp<unbiased, has_truncation, Delta> {
+    return {make_pair, delta, need_update};
+  }
+};
+
+/**
+ * \brief Calculate gradient for all pairs using update op created by make_get_grad.
+ *
+ * We need to run gradient calculation twice. The first run gathers information like the
+ * maximum gradient, maximum cost, and the normalization term using reduction. The second
+ * run performs the actual update.
+ *
+ * Without normalization, we only need to run it once since we can manually calculate
+ * the bounds of gradient (NDCG \in [0, 1], delta_NDCG \in [0, 1], ti+/tj- are from the
+ * previous iteration so the bound can be calculated for current iteration). However, if
+ * normalization is used, the delta score is un-bounded and we need to obtain the sum
+ * gradient. As a tradeoff, we simply run the kernel twice, once as reduction, second
+ * one as for_each.
+ *
+ * Alternatively, we can bound the delta score by limiting the output of the model using
+ * sigmoid for binary output and some normalization for multi-level. But the effect on
+ * accuracy is not known yet, and it's only used by the GPU implementation.
+ *
+ * For performance, the segmented sort for sorted scores is the bottleneck and takes up
+ * about half of the time, while the reduction and for_each takes up the second half.
+ */
+template <bool unbiased, bool has_truncation, typename Delta>
+void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::RankingCache> p_cache,
+              MakeGetGrad<unbiased, has_truncation, Delta> make_get_grad) {
+  auto n_groups = p_cache->Groups();
+  auto d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr();
+  auto d_gptr = p_cache->DataGroupPtr(ctx);
+  auto d_gpair = make_get_grad.Args().gpairs;  // output gradients, updated in place
+
+  /**
+   * First pass, gather info for normalization and rounding factor.
+   */
+  auto val_it = dh::MakeTransformIterator<GradCostNorm>(thrust::make_counting_iterator(0ul),
+                                                        make_get_grad(false));
+  auto reduction_op = [] XGBOOST_DEVICE(GradCostNorm const& l,
+                                        GradCostNorm const& r) -> GradCostNorm {
+    // get maximum gradient for each group, along with cost and the normalization term
+    auto const& lg = thrust::get<0>(l);
+    auto const& rg = thrust::get<0>(r);
+    auto grad = std::max(lg.GetGrad(), rg.GetGrad());
+    auto hess = std::max(lg.GetHess(), rg.GetHess());
+    auto cost = std::max(thrust::get<1>(l), thrust::get<1>(r));
+    double sum_lambda = thrust::get<2>(l) + thrust::get<2>(r);
+    return thrust::make_tuple(GradientPair{std::abs(grad), std::abs(hess)}, cost, sum_lambda);
+  };
+  auto init = thrust::make_tuple(GradientPair{0.0f, 0.0f}, 0.0, 0.0);
+  common::Span<GradCostNorm> d_max_lambdas = p_cache->MaxLambdas<GradCostNorm>(ctx, n_groups);
+  CHECK_EQ(n_groups * sizeof(GradCostNorm), d_max_lambdas.size_bytes());
+
+  std::size_t bytes{0};  // filled in by the size query (null temp storage) below
+  dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(
+      nullptr, bytes, val_it, d_max_lambdas.data(), n_groups, d_threads_group_ptr.data(),
+      d_threads_group_ptr.data() + 1, reduction_op, init, ctx->CUDACtx()->Stream()));
+  dh::TemporaryArray<char> temp(bytes);
+  dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(
+      temp.data().get(), bytes, val_it, d_max_lambdas.data(), n_groups, d_threads_group_ptr.data(),
+      d_threads_group_ptr.data() + 1, reduction_op, init, ctx->CUDACtx()->Stream()));
+
+  dh::TemporaryArray<double> min_bias(2);
+  auto d_min_bias = dh::ToSpan(min_bias);
+  if (unbiased) {  // the minimum position bias is only read by the unbiased branch below
+    MinBias(ctx, p_cache, make_get_grad.Args().ti_plus, make_get_grad.Args().tj_minus, d_min_bias);
+  }
+  /**
+   * Create rounding factors
+   */
+  auto d_cost_rounding = p_cache->CUDACostRounding(ctx);
+  auto d_rounding = p_cache->CUDARounding(ctx);
+  dh::LaunchN(n_groups, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t g) mutable {
+    auto group_size = d_gptr[g + 1] - d_gptr[g];
+    auto const& max_grad = thrust::get<0>(d_max_lambdas[g]);
+    // float group size
+    auto fgs = static_cast<float>(group_size);
+    auto grad = common::CreateRoundingFactor(fgs * max_grad.GetGrad(), group_size);
+    auto hess = common::CreateRoundingFactor(fgs * max_grad.GetHess(), group_size);
+    d_rounding(g) = GradientPair{grad, hess};
+
+    auto cost = thrust::get<1>(d_max_lambdas[g]);
+    if (unbiased) {  // NOTE(review): every group g writes d_cost_rounding[0]; confirm intended
+      cost /= std::min(d_min_bias[0], d_min_bias[1]);
+      d_cost_rounding[0] = common::CreateRoundingFactor(fgs * cost, group_size);
+    }
+  });
+
+  /**
+   * Second pass, actual update to gradient and bias.
+   */
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul),
+                     p_cache->CUDAThreads(), make_get_grad(true));
+
+  /**
+   * Lastly, normalization and weight.
+   */
+  auto d_weights = common::MakeOptionalWeights(ctx, info.weights_);
+  auto w_norm = p_cache->WeightNorm();
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.size(),
+                     [=] XGBOOST_DEVICE(std::size_t i) {
+                       auto g = dh::SegmentId(d_gptr, i);
+                       auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
+                       // Normalization
+                       if (sum_lambda > 0.0) {  // skip groups whose lambda sum is not positive
+                         double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
+                         d_gpair[i] *= norm;
+                       }
+                       d_gpair[i] *= (d_weights[g] * w_norm);
+                     });
+}
+
+/**
+ * \brief Prepare device inputs and dispatch CalcGrad on the unbiased/truncation settings.
+ */
+template <typename Delta>
+void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const& preds,
+            const MetaInfo& info, std::shared_ptr<ltr::RankingCache> p_cache, Delta delta,
+            linalg::VectorView<double const> ti_plus,   // input bias ratio
+            linalg::VectorView<double const> tj_minus,  // input bias ratio
+            linalg::VectorView<double> li, linalg::VectorView<double> lj,
+            HostDeviceVector<GradientPair>* out_gpair) {
+  // Validate the cache before anything dereferences it.
+  CHECK(p_cache);
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+
+  info.labels.SetDevice(device_id);
+  preds.SetDevice(device_id);
+  out_gpair->SetDevice(device_id);
+  out_gpair->Resize(preds.Size());
+
+  // Rounding buffers come from the cache; CalcGrad fills them before the update pass.
+  auto d_rounding = p_cache->CUDARounding(ctx);
+  auto d_cost_rounding = p_cache->CUDACostRounding(ctx);
+
+  CHECK_NE(d_rounding.Size(), 0);
+
+  auto label = info.labels.View(device_id);
+  auto predts = preds.ConstDeviceSpan();
+  auto gpairs = out_gpair->DeviceSpan();
+  // Gradients are accumulated with atomic adds, so the output has to start from zero.
+  thrust::fill_n(ctx->CUDACtx()->CTP(), gpairs.data(), gpairs.size(), GradientPair{0.0f, 0.0f});
+
+  auto const d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr();
+  auto const d_gptr = p_cache->DataGroupPtr(ctx);
+  auto const rank_idx = p_cache->SortedIdx(ctx, predts);
+
+  auto const unbiased = p_cache->Param().lambdarank_unbiased;
+
+  common::Span<std::size_t const> d_y_sorted_idx;
+  if (!p_cache->Param().HasTruncation()) {
+    d_y_sorted_idx = SortY(ctx, info, rank_idx, p_cache);
+  }
+
+  KernelInputs args{ti_plus,        tj_minus, li,     lj,     d_gptr,     d_threads_group_ptr,
+                    rank_idx,       label,    predts, gpairs, d_rounding, d_cost_rounding.data(),
+                    d_y_sorted_idx, iter};
+
+  // dispatch based on unbiased and truncation
+  if (p_cache->Param().HasTruncation()) {
+    if (unbiased) {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<true, true, Delta>{args, delta});
+    } else {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<false, true, Delta>{args, delta});
+    }
+  } else {
+    if (unbiased) {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<true, false, Delta>{args, delta});
+    } else {
+      CalcGrad(ctx, info, p_cache, MakeGetGrad<false, false, Delta>{args, delta});
+    }
+  }
+}
+}  // anonymous namespace
+
 common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
                                       common::Span<std::size_t const> d_rank,
                                       std::shared_ptr<ltr::RankingCache> p_cache) {
@@ -58,5 +364,116 @@ common::Span<std::size_t const> SortY(Context const* ctx, MetaInfo const& info,
   common::SegmentedArgSort<false, true>(ctx, d_y_ranked, d_group_ptr, d_y_sorted_idx);
   return d_y_sorted_idx;
 }
+
+void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
+                               const HostDeviceVector<float>& preds, const MetaInfo& info,
+                               std::shared_ptr<ltr::NDCGCache> p_cache,
+                               linalg::VectorView<double const> ti_plus,   // input bias ratio
+                               linalg::VectorView<double const> tj_minus,  // input bias ratio
+                               linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                               HostDeviceVector<GradientPair>* out_gpair) {
+  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));  // select the device before touching the cache
+  auto const inv_idcg = p_cache->InvIDCG(ctx);
+  auto const d_discount = p_cache->Discount(ctx);
+
+  info.labels.SetDevice(ctx->gpu_id);
+  preds.SetDevice(ctx->gpu_id);
+
+  auto const use_exp_gain = p_cache->Param().ndcg_exp_gain;  // picks the DeltaNDCG variant
+  auto delta = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high,
+                                  std::size_t rank_low, bst_group_t g) {
+    if (use_exp_gain) {
+      return DeltaNDCG<true>(y_high, y_low, rank_high, rank_low, inv_idcg(g), d_discount);
+    }
+    return DeltaNDCG<false>(y_high, y_low, rank_high, rank_low, inv_idcg(g), d_discount);
+  };
+  Launch(ctx, iter, preds, info, p_cache, delta, ti_plus, tj_minus, li, lj, out_gpair);
+}
+
+namespace {
+struct ReduceOp {  // binary op for the segmented reduce: element-wise sum of two 2-tuples
+  template <typename Tup>
+  Tup XGBOOST_DEVICE operator()(Tup const& lhs, Tup const& rhs) {
+    return thrust::make_tuple(thrust::get<0>(lhs) + thrust::get<0>(rhs),
+                              thrust::get<1>(lhs) + thrust::get<1>(rhs));
+  }
+};
+}  // namespace
+
+void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
+                                  linalg::VectorView<double const> lj_full,
+                                  linalg::Vector<double>* p_ti_plus,
+                                  linalg::Vector<double>* p_tj_minus,
+                                  linalg::Vector<double>* p_li,  // loss
+                                  linalg::Vector<double>* p_lj,
+                                  std::shared_ptr<ltr::RankingCache> p_cache) {
+  auto const d_group_ptr = p_cache->DataGroupPtr(ctx);
+  auto n_groups = d_group_ptr.size() - 1;
+
+  auto ti_plus = p_ti_plus->View(ctx->gpu_id);
+  auto tj_minus = p_tj_minus->View(ctx->gpu_id);
+
+  auto li = p_li->View(ctx->gpu_id);
+  auto lj = p_lj->View(ctx->gpu_id);
+  CHECK_EQ(li.Size(), ti_plus.Size());
+
+  auto regularizer = p_cache->Param().Regularizer();  // exponent applied to the bias ratios
+  std::size_t k = p_cache->MaxPositionSize();
+
+  CHECK_EQ(li.Size(), k);
+  CHECK_EQ(lj.Size(), k);
+  // Sum l_full over all groups for each position, which yields k values (li and lj).
+  auto make_iter = [&](linalg::VectorView<double const> l_full) {
+    auto l_it = [=] XGBOOST_DEVICE(std::size_t i) {
+      // group index
+      auto g = i % n_groups;
+      // rank is the position within a group, also the segment index
+      auto r = i / n_groups;
+
+      auto begin = d_group_ptr[g];
+      std::size_t group_size = d_group_ptr[g + 1] - begin;
+      auto n = std::min(group_size, k);
+      // r can be greater than n since we allocate threads based on truncation level
+      // instead of actual group size.
+      if (r >= n) {
+        return 0.0;
+      }
+      return l_full(r + begin);
+    };
+    return l_it;
+  };
+  auto li_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), make_iter(li_full));
+  auto lj_it =
+      dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul), make_iter(lj_full));
+  // k segments, each segment has size n_groups.
+  auto key_it = dh::MakeTransformIterator<std::size_t>(
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) { return i * n_groups; });
+  auto val_it = thrust::make_zip_iterator(thrust::make_tuple(li_it, lj_it));
+  auto out_it =
+      thrust::make_zip_iterator(thrust::make_tuple(li.Values().data(), lj.Values().data()));
+
+  auto init = thrust::make_tuple(0.0, 0.0);
+  auto stream = ctx->CUDACtx()->Stream();
+  std::size_t bytes{0};  // filled in by the size query (null temp storage) below
+  dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, out_it, k, key_it,
+                                                   key_it + 1, ReduceOp{}, init, stream));
+  dh::TemporaryArray<char> temp(bytes);
+  dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(temp.data().get(), bytes, val_it, out_it, k,
+                                                   key_it, key_it + 1, ReduceOp{}, init, stream));
+  // Normalize each position by position 0 and raise the ratio to the power of regularizer.
+  thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), li.Size(),
+                     [=] XGBOOST_DEVICE(std::size_t i) mutable {
+                       if (li(0) >= Eps64()) {
+                         ti_plus(i) = std::pow(li(i) / li(0), regularizer);
+                       }
+                       if (lj(0) >= Eps64()) {
+                         tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
+                       }
+                       assert(!std::isinf(ti_plus(i)));
+                       assert(!std::isinf(tj_minus(i)));
+                     });
+}
 }  // namespace cuda_impl
 }  // namespace xgboost::obj
diff --git a/src/objective/lambdarank_obj.h b/src/objective/lambdarank_obj.h
index 3adb27a2e..0eb06e27c 100644
--- a/src/objective/lambdarank_obj.h
+++ b/src/objective/lambdarank_obj.h
@@ -1,5 +1,15 @@
 /**
- * Copyright 2023 XGBoost contributors
+ * Copyright 2023, XGBoost contributors
+ *
+ * Vocabulary explanation:
+ *
+ * There are two different lists we need to handle in the objective, first is the list of
+ * labels (relevance degree) provided by the user. Its order has no particular meaning
+ * when bias estimation is NOT used. Another one is generated by our model, sorted index
+ * based on prediction scores. `rank_high` refers to the position index of the model rank
+ * list that is higher than `rank_low`, while `idx_high` refers to where the
+ * `rank_high` sample comes from. Simply put, `rank_high` indexes into the rank list
+ * obtained from the model, while `idx_high` indexes into the user provided sample list.
  */
 #ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
 #define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_H_
@@ -25,14 +35,19 @@
 #include "xgboost/span.h"                  // for Span
 
 namespace xgboost::obj {
+double constexpr Eps64() { return 1e-16; }  // small floor for the hessian and bias guards
+
 template <bool exp>
-XGBOOST_DEVICE double DeltaNDCG(float y_high, float y_low, std::size_t r_high, std::size_t r_low,
-                                double inv_IDCG, common::Span<double const> discount) {
+XGBOOST_DEVICE double DeltaNDCG(float y_high, float y_low, std::size_t rank_high,
+                                std::size_t rank_low, double inv_IDCG,
+                                common::Span<double const> discount) {
+  // Use rank_high instead of idx_high as we are calculating discount based on ranks
+  // provided by the model.
   double gain_high = exp ? ltr::CalcDCGGain(y_high) : y_high;
-  double discount_high = discount[r_high];
+  double discount_high = discount[rank_high];
 
   double gain_low = exp ? ltr::CalcDCGGain(y_low) : y_low;
-  double discount_low = discount[r_low];
+  double discount_low = discount[rank_low];
 
   double original = gain_high * discount_high + gain_low * discount_low;
   double changed = gain_low * discount_high + gain_high * discount_low;
@@ -70,9 +85,9 @@ template <bool unbiased, typename Delta>
 XGBOOST_DEVICE GradientPair
 LambdaGrad(linalg::VectorView<float const> labels, common::Span<float const> predts,
            common::Span<size_t const> sorted_idx,
-           std::size_t rank_high,                     // cordiniate
-           std::size_t rank_low,                      // cordiniate
-           Delta delta,                               // delta score
+           std::size_t rank_high,                     // higher index on the model rank list
+           std::size_t rank_low,                      // lower index on the model rank list
+           Delta delta,                               // function to calculate delta score
            linalg::VectorView<double const> t_plus,   // input bias ratio
            linalg::VectorView<double const> t_minus,  // input bias ratio
            double* p_cost) {
@@ -95,30 +110,34 @@ LambdaGrad(linalg::VectorView<float const> labels, common::Span<float const> pre
 
   // Use double whenever possible as we are working on the exp space.
   double delta_score = std::abs(s_high - s_low);
-  double sigmoid = common::Sigmoid(s_high - s_low);
+  double const sigmoid = common::Sigmoid(s_high - s_low);
   // Change in metric score like \delta NDCG or \delta MAP
   double delta_metric = std::abs(delta(y_high, y_low, rank_high, rank_low));
 
   if (best_score != worst_score) {
-    delta_metric /= (delta_score + kRtEps);
+    delta_metric /= (delta_score + 0.01);
   }
 
   if (unbiased) {
     *p_cost = std::log(1.0 / (1.0 - sigmoid)) * delta_metric;
   }
 
-  constexpr double kEps = 1e-16;
   auto lambda_ij = (sigmoid - 1.0) * delta_metric;
-  auto hessian_ij = std::max(sigmoid * (1.0 - sigmoid), kEps) * delta_metric * 2.0;
+  auto hessian_ij = std::max(sigmoid * (1.0 - sigmoid), Eps64()) * delta_metric * 2.0;
 
   auto k = t_plus.Size();
   assert(t_minus.Size() == k && "Invalid size of position bias");
 
-  if (unbiased && idx_high < k && idx_low < k) {
-    lambda_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
-    hessian_ij /= (t_minus(idx_low) * t_plus(idx_high) + kRtEps);
+  // We need to skip samples that exceed the maximum number of tracked positions, and
+  // samples that have low probability and might bring us floating point issues.
+  if (unbiased && idx_high < k && idx_low < k && t_minus(idx_low) >= Eps64() &&
+      t_plus(idx_high) >= Eps64()) {
+    // The index should be ranks[idx_low], since we assume label is sorted, this reduces
+    // to `idx_low`, which represents the position on the input list, as explained in the
+    // file header.
+    lambda_ij /= (t_plus(idx_high) * t_minus(idx_low));
+    hessian_ij /= (t_plus(idx_high) * t_minus(idx_low));
   }
-
   auto pg = GradientPair{static_cast<float>(lambda_ij), static_cast<float>(hessian_ij)};
   return pg;
 }
@@ -137,27 +156,6 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                                linalg::VectorView<double> li, linalg::VectorView<double> lj,
                                HostDeviceVector<GradientPair>* out_gpair);
 
-/**
- * \brief Generate statistic for MAP used for calculating \Delta Z in lambda mart.
- */
-void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
-             std::shared_ptr<ltr::MAPCache> p_cache);
-
-void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
-                              HostDeviceVector<float> const& predt, MetaInfo const& info,
-                              std::shared_ptr<ltr::MAPCache> p_cache,
-                              linalg::VectorView<double const> t_plus,   // input bias ratio
-                              linalg::VectorView<double const> t_minus,  // input bias ratio
-                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
-                              HostDeviceVector<GradientPair>* out_gpair);
-
-void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
-                                   HostDeviceVector<float> const& predt, const MetaInfo& info,
-                                   std::shared_ptr<ltr::RankingCache> p_cache,
-                                   linalg::VectorView<double const> ti_plus,   // input bias ratio
-                                   linalg::VectorView<double const> tj_minus,  // input bias ratio
-                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
-                                   HostDeviceVector<GradientPair>* out_gpair);
 
 void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
                                   linalg::VectorView<double const> lj_full,
@@ -167,18 +165,6 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
                                   std::shared_ptr<ltr::RankingCache> p_cache);
 }  // namespace cuda_impl
 
-namespace cpu_impl {
-/**
- * \brief Generate statistic for MAP used for calculating \Delta Z in lambda mart.
- *
- * \param label    Ground truth relevance label.
- * \param rank_idx Sorted index of prediction.
- * \param p_cache  An initialized MAPCache.
- */
-void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
-             common::Span<std::size_t const> rank_idx, std::shared_ptr<ltr::MAPCache> p_cache);
-}  // namespace cpu_impl
-
 /**
  * \param Construct pairs on CPU
  *
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
index d3b01d80b..7d2c37811 100644
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -48,12 +48,15 @@ DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
+DMLC_REGISTRY_LINK_TAG(lambdarank_obj);
+DMLC_REGISTRY_LINK_TAG(lambdarank_obj_cu);
 #else
 DMLC_REGISTRY_LINK_TAG(regression_obj);
 DMLC_REGISTRY_LINK_TAG(quantile_obj);
 DMLC_REGISTRY_LINK_TAG(hinge_obj);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj);
 DMLC_REGISTRY_LINK_TAG(rank_obj);
+DMLC_REGISTRY_LINK_TAG(lambdarank_obj);
 #endif  // XGBOOST_USE_CUDA
 }  // namespace obj
 }  // namespace xgboost
diff --git a/src/objective/rank_obj.cu b/src/objective/rank_obj.cu
index f1c870210..23613d93d 100644
--- a/src/objective/rank_obj.cu
+++ b/src/objective/rank_obj.cu
@@ -207,174 +207,6 @@ class IndexablePredictionSorter {
 };
 #endif
 
-// beta version: NDCG lambda rank
-class NDCGLambdaWeightComputer
-#if defined(__CUDACC__)
-  : public IndexablePredictionSorter
-#endif
-{
- public:
-#if defined(__CUDACC__)
-  // This function object computes the item's DCG value
-  class ComputeItemDCG : public thrust::unary_function<uint32_t, float> {
-   public:
-    XGBOOST_DEVICE ComputeItemDCG(const common::Span<const float> &dsorted_labels,
-                                  const common::Span<const uint32_t> &dgroups,
-                                  const common::Span<const uint32_t> &gidxs)
-      : dsorted_labels_(dsorted_labels),
-        dgroups_(dgroups),
-        dgidxs_(gidxs) {}
-
-    // Compute DCG for the item at 'idx'
-    __device__ __forceinline__ float operator()(uint32_t idx) const {
-      return ComputeItemDCGWeight(dsorted_labels_[idx], idx - dgroups_[dgidxs_[idx]]);
-    }
-
-   private:
-    const common::Span<const float> dsorted_labels_;  // Labels sorted within a group
-    const common::Span<const uint32_t> dgroups_;  // The group indices - where each group
-                                                  // begins and ends
-    const common::Span<const uint32_t> dgidxs_;  // The group each items belongs to
-  };
-
-  // Type containing device pointers that can be cheaply copied on the kernel
-  class NDCGLambdaWeightMultiplier : public BaseLambdaWeightMultiplier {
-   public:
-    NDCGLambdaWeightMultiplier(const dh::SegmentSorter<float> &segment_label_sorter,
-                               const NDCGLambdaWeightComputer &lwc)
-      : BaseLambdaWeightMultiplier(segment_label_sorter, lwc.GetPredictionSorter()),
-        dgroup_dcgs_(lwc.GetGroupDcgsSpan()) {}
-
-    // Adjust the items weight by this value
-    __device__ __forceinline__ bst_float GetWeight(uint32_t gidx, int pidx, int nidx) const {
-      if (dgroup_dcgs_[gidx] == 0.0) return 0.0f;
-
-      uint32_t group_begin = dgroups_[gidx];
-
-      auto pos_lab_orig_posn = dorig_pos_[pidx];
-      auto neg_lab_orig_posn = dorig_pos_[nidx];
-      KERNEL_CHECK(pos_lab_orig_posn != neg_lab_orig_posn);
-
-      // Note: the label positive and negative indices are relative to the entire dataset.
-      // Hence, scale them back to an index within the group
-      auto pos_pred_pos = dindexable_sorted_preds_pos_[pos_lab_orig_posn] - group_begin;
-      auto neg_pred_pos = dindexable_sorted_preds_pos_[neg_lab_orig_posn] - group_begin;
-      return NDCGLambdaWeightComputer::ComputeDeltaWeight(
-        pos_pred_pos, neg_pred_pos,
-        static_cast<int>(dsorted_labels_[pidx]), static_cast<int>(dsorted_labels_[nidx]),
-        dgroup_dcgs_[gidx]);
-    }
-
-   private:
-     const common::Span<const float> dgroup_dcgs_;  // Group DCG values
-  };
-
-  NDCGLambdaWeightComputer(const bst_float *dpreds,
-                           const bst_float*,
-                           const dh::SegmentSorter<float> &segment_label_sorter)
-    : IndexablePredictionSorter(dpreds, segment_label_sorter),
-      dgroup_dcg_(segment_label_sorter.GetNumGroups(), 0.0f),
-      weight_multiplier_(segment_label_sorter, *this) {
-    const auto &group_segments = segment_label_sorter.GetGroupSegmentsSpan();
-
-    // Allocator to be used for managing space overhead while performing transformed reductions
-    dh::XGBCachingDeviceAllocator<char> alloc;
-
-    // Compute each elements DCG values and reduce them across groups concurrently.
-    auto end_range =
-      thrust::reduce_by_key(thrust::cuda::par(alloc),
-                            dh::tcbegin(group_segments), dh::tcend(group_segments),
-                            thrust::make_transform_iterator(
-                              // The indices need not be sequential within a group, as we care only
-                              // about the sum of items DCG values within a group
-                              dh::tcbegin(segment_label_sorter.GetOriginalPositionsSpan()),
-                              ComputeItemDCG(segment_label_sorter.GetItemsSpan(),
-                                             segment_label_sorter.GetGroupsSpan(),
-                                             group_segments)),
-                            thrust::make_discard_iterator(),  // We don't care for the group indices
-                            dgroup_dcg_.begin());  // Sum of the item's DCG values in the group
-    CHECK_EQ(static_cast<unsigned>(end_range.second - dgroup_dcg_.begin()), dgroup_dcg_.size());
-  }
-
-  inline const common::Span<const float> GetGroupDcgsSpan() const {
-    return { dgroup_dcg_.data().get(), dgroup_dcg_.size() };
-  }
-
-  inline const NDCGLambdaWeightMultiplier GetWeightMultiplier() const {
-    return weight_multiplier_;
-  }
-#endif
-
-  static void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                              std::vector<LambdaPair> *io_pairs) {
-    std::vector<LambdaPair> &pairs = *io_pairs;
-    float IDCG;  // NOLINT
-    {
-      std::vector<bst_float> labels(sorted_list.size());
-      for (size_t i = 0; i < sorted_list.size(); ++i) {
-        labels[i] = sorted_list[i].label;
-      }
-      std::stable_sort(labels.begin(), labels.end(), std::greater<>());
-      IDCG = ComputeGroupDCGWeight(&labels[0], labels.size());
-    }
-    if (IDCG == 0.0) {
-      for (auto & pair : pairs) {
-        pair.weight = 0.0f;
-      }
-    } else {
-      for (auto & pair : pairs) {
-        unsigned pos_idx = pair.pos_index;
-        unsigned neg_idx = pair.neg_index;
-        pair.weight *= ComputeDeltaWeight(pos_idx, neg_idx,
-                                          sorted_list[pos_idx].label, sorted_list[neg_idx].label,
-                                          IDCG);
-      }
-    }
-  }
-
-  static char const* Name() {
-    return "rank:ndcg";
-  }
-
-  inline static bst_float ComputeGroupDCGWeight(const float *sorted_labels, uint32_t size) {
-    double sumdcg = 0.0;
-    for (uint32_t i = 0; i < size; ++i) {
-      sumdcg += ComputeItemDCGWeight(sorted_labels[i], i);
-    }
-
-    return static_cast<bst_float>(sumdcg);
-  }
-
- private:
-  XGBOOST_DEVICE inline static bst_float ComputeItemDCGWeight(unsigned label, uint32_t idx) {
-    return (label != 0) ? (((1 << label) - 1) / std::log2(static_cast<bst_float>(idx + 2))) : 0;
-  }
-
-  // Compute the weight adjustment for an item within a group:
-  // pos_pred_pos => Where does the positive label live, had the list been sorted by prediction
-  // neg_pred_pos => Where does the negative label live, had the list been sorted by prediction
-  // pos_label => positive label value from sorted label list
-  // neg_label => negative label value from sorted label list
-  XGBOOST_DEVICE inline static bst_float ComputeDeltaWeight(uint32_t pos_pred_pos,
-                                                            uint32_t neg_pred_pos,
-                                                            int pos_label, int neg_label,
-                                                            float idcg) {
-    float pos_loginv = 1.0f / std::log2(pos_pred_pos + 2.0f);
-    float neg_loginv = 1.0f / std::log2(neg_pred_pos + 2.0f);
-    bst_float original = ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv;
-    float changed = ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv;
-    bst_float delta = (original - changed) * (1.0f / idcg);
-    if (delta < 0.0f) delta = - delta;
-    return delta;
-  }
-
-#if defined(__CUDACC__)
-  dh::caching_device_vector<float> dgroup_dcg_;
-  // This computes the adjustment to the weight
-  const NDCGLambdaWeightMultiplier weight_multiplier_;
-#endif
-};
-
 class MAPLambdaWeightComputer
 #if defined(__CUDACC__)
   : public IndexablePredictionSorter
@@ -948,10 +780,6 @@ XGBOOST_REGISTER_OBJECTIVE(PairwiseRankObj, PairwiseLambdaWeightComputer::Name()
 .describe("Pairwise rank objective.")
 .set_body([]() { return new LambdaRankObj<PairwiseLambdaWeightComputer>(); });
 
-XGBOOST_REGISTER_OBJECTIVE(LambdaRankNDCG, NDCGLambdaWeightComputer::Name())
-.describe("LambdaRank with NDCG as objective.")
-.set_body([]() { return new LambdaRankObj<NDCGLambdaWeightComputer>(); });
-
 XGBOOST_REGISTER_OBJECTIVE(LambdaRankObjMAP, MAPLambdaWeightComputer::Name())
 .describe("LambdaRank with MAP as objective.")
 .set_body([]() { return new LambdaRankObj<MAPLambdaWeightComputer>(); });
diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc
index 11cbf6bec..d02a55c1b 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cc
+++ b/tests/cpp/objective/test_lambdarank_obj.cc
@@ -5,6 +5,7 @@
 
 #include <gtest/gtest.h>                        // for Test, Message, TestPartResult, CmpHel...
 
+#include <algorithm>                            // for sort
 #include <cstddef>                              // for size_t
 #include <initializer_list>                     // for initializer_list
 #include <map>                                  // for map
@@ -13,7 +14,6 @@
 #include <string>                               // for char_traits, basic_string, string
 #include <vector>                               // for vector
 
-#include "../../../src/common/ranking_utils.h"  // for LambdaRankParam
 #include "../../../src/common/ranking_utils.h"  // for NDCGCache, LambdaRankParam
 #include "../helpers.h"                         // for CheckRankingObjFunction, CheckConfigReload
 #include "xgboost/base.h"                       // for GradientPair, bst_group_t, Args
@@ -25,6 +25,126 @@
 #include "xgboost/span.h"                       // for Span
 
 namespace xgboost::obj {
+TEST(LambdaRank, NDCGJsonIO) {
+  Context ctx;
+  TestNDCGJsonIO(&ctx);
+}
+
+void TestNDCGGPair(Context const* ctx) {
+  {
+    std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+    obj->Configure(Args{{"lambdarank_pair_method", "topk"}});
+    CheckConfigReload(obj, "rank:ndcg");
+
+    // No gain in swapping 2 documents.
+    CheckRankingObjFunction(obj,
+                            {1, 1, 1, 1},
+                            {1, 1, 1, 1},
+                            {1.0f, 1.0f},
+                            {0, 2, 4},
+                            {0.0f, -0.0f, 0.0f, 0.0f},
+                            {0.0f, 0.0f, 0.0f, 0.0f});
+  }
+  {
+    std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+    obj->Configure(Args{{"lambdarank_pair_method", "topk"}});
+    // Test with setting sample weight to second query group
+    CheckRankingObjFunction(obj,
+                            {0, 0.1f, 0, 0.1f},
+                            {0,   1, 0, 1},
+                            {2.0f, 0.0f},
+                            {0, 2, 4},
+                            {2.06611f, -2.06611f, 0.0f, 0.0f},
+                            {2.169331f, 2.169331f, 0.0f, 0.0f});
+
+    CheckRankingObjFunction(obj,
+                            {0, 0.1f, 0, 0.1f},
+                            {0,   1, 0, 1},
+                            {2.0f, 2.0f},
+                            {0, 2, 4},
+                            {2.06611f, -2.06611f, 2.06611f, -2.06611f},
+                            {2.169331f, 2.169331f, 2.169331f, 2.169331f});
+  }
+
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+  obj->Configure(Args{{"lambdarank_pair_method", "topk"}});
+
+  HostDeviceVector<float> predts{0, 1, 0, 1};
+  MetaInfo info;
+  info.labels = linalg::Tensor<float, 2>{{0, 1, 0, 1}, {4, 1}, GPUIDX};
+  info.group_ptr_ = {0, 2, 4};
+  info.num_row_ = 4;
+  HostDeviceVector<GradientPair> gpairs;
+  obj->GetGradient(predts, info, 0, &gpairs);
+  ASSERT_EQ(gpairs.Size(), predts.Size());
+
+  {
+    predts = {1, 0, 1, 0};
+    HostDeviceVector<GradientPair> gpairs;
+    obj->GetGradient(predts, info, 0, &gpairs);
+    for (size_t i = 0; i < gpairs.Size(); ++i) {
+      ASSERT_GT(gpairs.HostSpan()[i].GetHess(), 0);
+    }
+    ASSERT_LT(gpairs.HostSpan()[1].GetGrad(), 0);
+    ASSERT_LT(gpairs.HostSpan()[3].GetGrad(), 0);
+
+    ASSERT_GT(gpairs.HostSpan()[0].GetGrad(), 0);
+    ASSERT_GT(gpairs.HostSpan()[2].GetGrad(), 0);
+
+    info.weights_ = {2, 3};
+    HostDeviceVector<GradientPair> weighted_gpairs;
+    obj->GetGradient(predts, info, 0, &weighted_gpairs);
+    auto const& h_gpairs = gpairs.ConstHostSpan();
+    auto const& h_weighted_gpairs = weighted_gpairs.ConstHostSpan();
+    for (size_t i : {0ul, 1ul}) {
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 2.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 2.0f);
+    }
+    for (size_t i : {2ul, 3ul}) {
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 3.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 3.0f);
+    }
+  }
+
+  ASSERT_NO_THROW(obj->DefaultEvalMetric());
+}
+
+TEST(LambdaRank, NDCGGPair) {
+  Context ctx;
+  TestNDCGGPair(&ctx);
+}
+
+void TestUnbiasedNDCG(Context const* ctx) {
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", ctx)};
+  obj->Configure(Args{{"lambdarank_pair_method", "topk"},
+                      {"lambdarank_unbiased", "true"},
+                      {"lambdarank_bias_norm", "0"}});
+  std::shared_ptr<DMatrix> p_fmat{RandomDataGenerator{10, 1, 0.0f}.GenerateDMatrix(true, false, 2)};
+  auto h_label = p_fmat->Info().labels.HostView().Values();
+  // Move clicked samples to the beginning.
+  std::sort(h_label.begin(), h_label.end(), std::greater<>{});
+  HostDeviceVector<float> predt(p_fmat->Info().num_row_, 1.0f);
+
+  HostDeviceVector<GradientPair> out_gpair;
+  obj->GetGradient(predt, p_fmat->Info(), 0, &out_gpair);
+
+  Json config{Object{}};
+  obj->SaveConfig(&config);
+  auto ti_plus = get<F32Array const>(config["ti+"]);
+  ASSERT_FLOAT_EQ(ti_plus[0], 1.0);
+  // bias is non-increasing when prediction is constant. (constant cost on swapping documents)
+  for (std::size_t i = 1; i < ti_plus.size(); ++i) {
+    ASSERT_LE(ti_plus[i], ti_plus[i - 1]);
+  }
+  auto tj_minus = get<F32Array const>(config["tj-"]);
+  ASSERT_FLOAT_EQ(tj_minus[0], 1.0);
+}
+
+TEST(LambdaRank, UnbiasedNDCG) {
+  Context ctx;
+  TestUnbiasedNDCG(&ctx);
+}
+
 void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector<float>* out_predt) {
   out_predt->SetDevice(ctx->gpu_id);
   MetaInfo& info = *out_info;
diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu
index 03ccdef8b..01d020dda 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cu
+++ b/tests/cpp/objective/test_lambdarank_obj.cu
@@ -12,6 +12,18 @@
 #include "test_lambdarank_obj.h"
 
 namespace xgboost::obj {
+TEST(LambdaRank, GPUNDCGJsonIO) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestNDCGJsonIO(&ctx);
+}
+
+TEST(LambdaRank, GPUNDCGGPair) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestNDCGGPair(&ctx);
+}
+
 void TestGPUMakePair() {
   Context ctx;
   ctx.gpu_id = 0;
@@ -107,6 +119,12 @@ void TestGPUMakePair() {
 
 TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }
 
+TEST(LambdaRank, GPUUnbiasedNDCG) {
+  Context ctx;
+  ctx.gpu_id = 0;
+  TestUnbiasedNDCG(&ctx);
+}
+
 template <typename CountFunctor>
 void RankItemCountImpl(std::vector<std::uint32_t> const &sorted_items, CountFunctor f,
                        std::uint32_t find_val, std::uint32_t exp_val) {
diff --git a/tests/cpp/objective/test_lambdarank_obj.h b/tests/cpp/objective/test_lambdarank_obj.h
index 8dd238d2b..aebe3ad54 100644
--- a/tests/cpp/objective/test_lambdarank_obj.h
+++ b/tests/cpp/objective/test_lambdarank_obj.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright (c) 2023, XGBoost Contributors
  */
 #ifndef XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_
 #define XGBOOST_OBJECTIVE_TEST_LAMBDARANK_OBJ_H_
@@ -18,6 +18,25 @@
 #include "../helpers.h"                             // for EmptyDMatrix
 
 namespace xgboost::obj {
+inline void TestNDCGJsonIO(Context const* ctx) {
+  std::unique_ptr<xgboost::ObjFunction> obj{ObjFunction::Create("rank:ndcg", ctx)};
+
+  obj->Configure(Args{});
+  Json j_obj{Object()};
+  obj->SaveConfig(&j_obj);
+
+  ASSERT_EQ(get<String>(j_obj["name"]), "rank:ndcg");
+  auto const& j_param = j_obj["lambdarank_param"];
+
+  ASSERT_EQ(get<String>(j_param["ndcg_exp_gain"]), "1");
+  ASSERT_EQ(get<String>(j_param["lambdarank_num_pair_per_sample"]),
+            std::to_string(ltr::LambdaRankParam::NotSet()));
+}
+
+void TestNDCGGPair(Context const* ctx);
+
+void TestUnbiasedNDCG(Context const* ctx);
+
 /**
  * \brief Initialize test data for make pair tests.
  */
diff --git a/tests/cpp/objective/test_ranking_obj.cc b/tests/cpp/objective/test_ranking_obj.cc
index a007750e3..2072f530e 100644
--- a/tests/cpp/objective/test_ranking_obj.cc
+++ b/tests/cpp/objective/test_ranking_obj.cc
@@ -35,24 +35,6 @@ TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPair)) {
   ASSERT_NO_THROW(obj->DefaultEvalMetric());
 }
 
-TEST(Objective, DeclareUnifiedTest(NDCG_JsonIO)) {
-  xgboost::Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", &ctx)};
-
-  obj->Configure(Args{});
-  Json j_obj {Object()};
-  obj->SaveConfig(&j_obj);
-
-  ASSERT_EQ(get<String>(j_obj["name"]), "rank:ndcg");;
-
-  auto const& j_param = j_obj["lambda_rank_param"];
-
-  ASSERT_EQ(get<String>(j_param["num_pairsample"]), "1");
-  ASSERT_EQ(get<String>(j_param["fix_list_weight"]), "0");
-}
-
 TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPairSameLabels)) {
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
@@ -71,33 +53,6 @@ TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPairSameLabels)) {
   ASSERT_NO_THROW(obj->DefaultEvalMetric());
 }
 
-TEST(Objective, DeclareUnifiedTest(NDCGRankingGPair)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:ndcg", &ctx)};
-  obj->Configure(args);
-  CheckConfigReload(obj, "rank:ndcg");
-
-  // Test with setting sample weight to second query group
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {0.7f, -0.7f, 0.0f, 0.0f},
-                          {0.74f, 0.74f, 0.0f, 0.0f});
-
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {1.0f, 1.0f},
-                          {0, 2, 4},
-                          {0.35f, -0.35f,  0.35f, -0.35f},
-                          {0.368f, 0.368f, 0.368f, 0.368f});
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
 TEST(Objective, DeclareUnifiedTest(MAPRankingGPair)) {
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
diff --git a/tests/cpp/objective/test_ranking_obj_gpu.cu b/tests/cpp/objective/test_ranking_obj_gpu.cu
index 540560c1f..cd40b4928 100644
--- a/tests/cpp/objective/test_ranking_obj_gpu.cu
+++ b/tests/cpp/objective/test_ranking_obj_gpu.cu
@@ -89,62 +89,6 @@ TEST(Objective, RankSegmentSorterAscendingTest) {
                                                      5, 4, 6});
 }
 
-TEST(Objective, NDCGLambdaWeightComputerTest) {
-  std::vector<float> hlabels = {3.1f, 1.2f, 2.3f, 4.4f,        // Labels
-                                7.8f, 5.01f, 6.96f,
-                                10.3f, 8.7f, 11.4f, 9.45f, 11.4f};
-  dh::device_vector<bst_float> dlabels(hlabels);
-
-  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(
-    {0, 4, 7, 12},                  // Groups
-    hlabels,
-    {4.4f, 3.1f, 2.3f, 1.2f,        // Expected sorted labels
-     7.8f, 6.96f, 5.01f,
-     11.4f, 11.4f, 10.3f, 9.45f, 8.7f},
-    {3, 0, 2, 1,                    // Expected original positions
-     4, 6, 5,
-     9, 11, 7, 10, 8});
-
-  // Created segmented predictions for the labels from above
-  std::vector<bst_float> hpreds{-9.78f, 24.367f, 0.908f, -11.47f,
-                                -1.03f, -2.79f, -3.1f,
-                                104.22f, 103.1f, -101.7f, 100.5f, 45.1f};
-  dh::device_vector<bst_float> dpreds(hpreds);
-
-  xgboost::obj::NDCGLambdaWeightComputer ndcg_lw_computer(dpreds.data().get(),
-                                                          dlabels.data().get(),
-                                                          *segment_label_sorter);
-
-  // Where will the predictions move from its current position, if they were sorted
-  // descendingly?
-  auto dsorted_pred_pos = ndcg_lw_computer.GetPredictionSorter().GetIndexableSortedPositionsSpan();
-  std::vector<uint32_t> hsorted_pred_pos(segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&hsorted_pred_pos, dsorted_pred_pos);
-  std::vector<uint32_t> expected_sorted_pred_pos{2, 0, 1, 3,
-                                                 4, 5, 6,
-                                                 7, 8, 11, 9, 10};
-  EXPECT_EQ(expected_sorted_pred_pos, hsorted_pred_pos);
-
-  // Check group DCG values
-  std::vector<float> hgroup_dcgs(segment_label_sorter->GetNumGroups());
-  dh::CopyDeviceSpanToVector(&hgroup_dcgs, ndcg_lw_computer.GetGroupDcgsSpan());
-  std::vector<uint32_t> hgroups(segment_label_sorter->GetNumGroups() + 1);
-  dh::CopyDeviceSpanToVector(&hgroups, segment_label_sorter->GetGroupsSpan());
-  EXPECT_EQ(hgroup_dcgs.size(), segment_label_sorter->GetNumGroups());
-  std::vector<float> hsorted_labels(segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&hsorted_labels, segment_label_sorter->GetItemsSpan());
-  for (size_t i = 0; i < hgroup_dcgs.size(); ++i) {
-    // Compute group DCG value on CPU and compare
-    auto gbegin = hgroups[i];
-    auto gend = hgroups[i + 1];
-    EXPECT_NEAR(
-      hgroup_dcgs[i],
-      xgboost::obj::NDCGLambdaWeightComputer::ComputeGroupDCGWeight(&hsorted_labels[gbegin],
-                                                                    gend - gbegin),
-      0.01f);
-  }
-}
-
 TEST(Objective, IndexableSortedItemsTest) {
   std::vector<float> hlabels = {3.1f, 1.2f, 2.3f, 4.4f,        // Labels
                                 7.8f, 5.01f, 6.96f,
diff --git a/tests/python-gpu/test_gpu_eval_metrics.py b/tests/python-gpu/test_gpu_eval_metrics.py
index 6d16aa44e..1e9d1a282 100644
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -1,3 +1,4 @@
+import json
 import sys
 
 import pytest
@@ -36,19 +37,16 @@ class TestGPUEvalMetrics:
 
         Xy = xgboost.DMatrix(X, y, group=group)
 
-        cpu = xgboost.train(
+        booster = xgboost.train(
             {"tree_method": "hist", "eval_metric": "auc", "objective": "rank:ndcg"},
             Xy,
             num_boost_round=10,
         )
-        cpu_auc = float(cpu.eval(Xy).split(":")[1])
-
-        gpu = xgboost.train(
-            {"tree_method": "gpu_hist", "eval_metric": "auc", "objective": "rank:ndcg"},
-            Xy,
-            num_boost_round=10,
-        )
-        gpu_auc = float(gpu.eval(Xy).split(":")[1])
+        cpu_auc = float(booster.eval(Xy).split(":")[1])
+        booster.set_param({"gpu_id": "0"})
+        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
+        gpu_auc = float(booster.eval(Xy).split(":")[1])
+        assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0"
 
         np.testing.assert_allclose(cpu_auc, gpu_auc)
 

From 42d100de188d6f1df29621f8f72c4e429bbfee22 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 19 Apr 2023 00:39:11 -0700
Subject: [PATCH 03/34] Make sure metrics work with federated learning (#9037)

---
 src/collective/aggregator.h                |  62 ++++++
 src/learner.cc                             |  43 +---
 src/metric/auc.cc                          |   4 +-
 src/metric/metric_common.h                 |   9 +-
 src/metric/rank_metric.cc                  |  28 ++-
 tests/cpp/helpers.cc                       |   4 +-
 tests/cpp/metric/test_survival_metric.cu   |  99 +--------
 tests/cpp/metric/test_survival_metric.h    | 107 +++++++++
 tests/cpp/plugin/helpers.h                 |   2 +-
 tests/cpp/plugin/test_federated_learner.cc |   2 +-
 tests/cpp/plugin/test_federated_metrics.cc | 243 +++++++++++++++++++++
 11 files changed, 451 insertions(+), 152 deletions(-)
 create mode 100644 src/collective/aggregator.h
 create mode 100644 tests/cpp/metric/test_survival_metric.h
 create mode 100644 tests/cpp/plugin/test_federated_metrics.cc

diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
new file mode 100644
index 000000000..ee499b4d1
--- /dev/null
+++ b/src/collective/aggregator.h
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2023 by XGBoost contributors
+ *
+ * Higher level functions built on top of the Communicator API, taking care of behavioral
+ * differences between row-split vs column-split distributed training, and horizontal vs vertical
+ * federated learning.
+ */
+#pragma once
+#include <xgboost/data.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "communicator-inl.h"
+
+namespace xgboost {
+namespace collective {
+
+/**
+ * @brief Apply the given function where the labels are.
+ *
+ * Normally all the workers have access to the labels, so the function is just applied locally. In
+ * vertical federated learning, we assume labels are only available on worker 0, so the function is
+ * applied there, with the results broadcast to other workers. A dmlc::Error thrown on worker 0 is
+ * caught and raised on every worker via LOG(FATAL), with its message cut to 1023 characters.
+ * @tparam Function The function used to calculate the results.
+ * @tparam Args Arguments to the function.
+ * @param info MetaInfo about the DMatrix.
+ * @param buffer The buffer storing the results; broadcast from worker 0 when vertical federated.
+ * @param size The size of the buffer in bytes.
+ * @param function The function used to calculate the results.
+ * @param args Arguments to the function.
+ */
+template <typename Function, typename... Args>
+void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function,
+                     Args&&... args) {
+  if (info.IsVerticalFederated()) {
+    // We assume labels are only available on worker 0, so the calculation is done there and the
+    // result is broadcast to other workers.
+    std::vector<char> message(1024);
+    if (collective::GetRank() == 0) {
+      try {
+        std::forward<Function>(function)(std::forward<Args>(args)...);
+      } catch (dmlc::Error& e) {
+        strncpy(&message[0], e.what(), message.size());
+        message.back() = '\0';  // strncpy does not null-terminate on truncation
+      }
+    }
+    collective::Broadcast(&message[0], message.size(), 0);
+    if (strlen(&message[0]) == 0) {  // empty message: worker 0 reported no error
+      collective::Broadcast(buffer, size, 0);
+    } else {
+      LOG(FATAL) << &message[0];
+    }
+  } else {
+    std::forward<Function>(function)(std::forward<Args>(args)...);
+  }
+}
+
+}  // namespace collective
+}  // namespace xgboost
diff --git a/src/learner.cc b/src/learner.cc
index 1150a2355..78297404b 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -34,6 +34,7 @@
 #include <utility>                        // for pair, as_const, move, swap
 #include <vector>                         // for vector
 
+#include "collective/aggregator.h"        // for ApplyWithLabels
 #include "collective/communicator-inl.h"  // for Allreduce, Broadcast, GetRank, IsDistributed
 #include "collective/communicator.h"      // for Operation
 #include "common/api_entry.h"             // for XGBAPIThreadLocalEntry
@@ -859,22 +860,10 @@ class LearnerConfiguration : public Learner {
   }
 
   void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
-    // Special handling for vertical federated learning.
-    if (info.IsVerticalFederated()) {
-      // We assume labels are only available on worker 0, so the estimation is calculated there
-      // and broadcast to other workers.
-      if (collective::GetRank() == 0) {
-        UsePtr(obj_)->InitEstimation(info, base_score);
-        collective::Broadcast(base_score->Data()->HostPointer(),
-                              sizeof(bst_float) * base_score->Size(), 0);
-      } else {
-        base_score->Reshape(1);
-        collective::Broadcast(base_score->Data()->HostPointer(),
-                              sizeof(bst_float) * base_score->Size(), 0);
-      }
-    } else {
-      UsePtr(obj_)->InitEstimation(info, base_score);
-    }
+    base_score->Reshape(1);
+    collective::ApplyWithLabels(info, base_score->Data()->HostPointer(),
+                                sizeof(bst_float) * base_score->Size(),
+                                [&] { UsePtr(obj_)->InitEstimation(info, base_score); });
   }
 };
 
@@ -1486,24 +1475,10 @@ class LearnerImpl : public LearnerIO {
  private:
   void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info, int iteration,
                    HostDeviceVector<GradientPair>* out_gpair) {
-    // Special handling for vertical federated learning.
-    if (info.IsVerticalFederated()) {
-      // We assume labels are only available on worker 0, so the gradients are calculated there
-      // and broadcast to other workers.
-      if (collective::GetRank() == 0) {
-        obj_->GetGradient(preds, info, iteration, out_gpair);
-        collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
-                              0);
-      } else {
-        CHECK_EQ(info.labels.Size(), 0)
-            << "In vertical federated learning, labels should only be on the first worker";
-        out_gpair->Resize(preds.Size());
-        collective::Broadcast(out_gpair->HostPointer(), out_gpair->Size() * sizeof(GradientPair),
-                              0);
-      }
-    } else {
-      obj_->GetGradient(preds, info, iteration, out_gpair);
-    }
+    out_gpair->Resize(preds.Size());
+    collective::ApplyWithLabels(info, out_gpair->HostPointer(),
+                                out_gpair->Size() * sizeof(GradientPair),
+                                [&] { obj_->GetGradient(preds, info, iteration, out_gpair); });
   }
 
   /*! \brief random number transformation seed. */
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index 2d4becfa8..bde3127ed 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -270,7 +270,9 @@ class EvalAUC : public MetricNoCache {
     }
     //  We use the global size to handle empty dataset.
     std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
-    collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
+    if (!info.IsVerticalFederated()) {
+      collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
+    }
     if (meta[0] == 0) {
       // Empty across all workers, which is not supported.
       auc = std::numeric_limits<double>::quiet_NaN();
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 5fbd6f256..a6fad7158 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -9,6 +9,8 @@
 #include <memory>  // shared_ptr
 #include <string>
 
+#include "../collective/aggregator.h"
+#include "../collective/communicator-inl.h"
 #include "../common/common.h"
 #include "xgboost/metric.h"
 
@@ -20,7 +22,12 @@ class MetricNoCache : public Metric {
   virtual double Eval(HostDeviceVector<float> const &predts, MetaInfo const &info) = 0;
 
   double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
-    return this->Eval(predts, p_fmat->Info());
+    double result{0.0};
+    auto const& info = p_fmat->Info();
+    collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
+      result = this->Eval(predts, info);
+    });
+    return result;
   }
 };
 
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 62efd0876..000b88e80 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -28,9 +28,8 @@
 #include <algorithm>                         // for stable_sort, copy, fill_n, min, max
 #include <array>                             // for array
 #include <cmath>                             // for log, sqrt
-#include <cstddef>                           // for size_t, std
-#include <cstdint>                           // for uint32_t
 #include <functional>                        // for less, greater
+#include <limits>                            // for numeric_limits
 #include <map>                               // for operator!=, _Rb_tree_const_iterator
 #include <memory>                            // for allocator, unique_ptr, shared_ptr, __shared_...
 #include <numeric>                           // for accumulate
@@ -39,15 +38,11 @@
 #include <utility>                           // for pair, make_pair
 #include <vector>                            // for vector
 
-#include "../collective/communicator-inl.h"  // for IsDistributed, Allreduce
-#include "../collective/communicator.h"      // for Operation
+#include "../collective/aggregator.h"        // for ApplyWithLabels
 #include "../common/algorithm.h"             // for ArgSort, Sort
 #include "../common/linalg_op.h"             // for cbegin, cend
 #include "../common/math.h"                  // for CmpFirst
 #include "../common/optional_weight.h"       // for OptionalWeights, MakeOptionalWeights
-#include "../common/ranking_utils.h"         // for LambdaRankParam, NDCGCache, ParseMetricName
-#include "../common/threading_utils.h"       // for ParallelFor
-#include "../common/transform_iterator.h"    // for IndexTransformIter
 #include "dmlc/common.h"                     // for OMPException
 #include "metric_common.h"                   // for MetricNoCache, GPUMetric, PackedReduceResult
 #include "xgboost/base.h"                    // for bst_float, bst_omp_uint, bst_group_t, Args
@@ -59,7 +54,6 @@
 #include "xgboost/linalg.h"                  // for Tensor, TensorView, Range, VectorView, MakeT...
 #include "xgboost/logging.h"                 // for CHECK, ConsoleLogger, LOG_INFO, CHECK_EQ
 #include "xgboost/metric.h"                  // for MetricReg, XGBOOST_REGISTER_METRIC, Metric
-#include "xgboost/span.h"                    // for Span, operator!=
 #include "xgboost/string_view.h"             // for StringView
 
 namespace {
@@ -385,15 +379,19 @@ class EvalRankWithCache : public Metric {
   }
 
   double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
+    double result{0.0};  // filled in by the callback passed to ApplyWithLabels
     auto const& info = p_fmat->Info();
-    auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
-    if (p_cache->Param() != param_) {
-      p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
-    }
-    CHECK(p_cache->Param() == param_);
-    CHECK_EQ(preds.Size(), info.labels.Size());
+    collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
+      auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
+      if (p_cache->Param() != param_) {  // cached entry has different parameters: reset it
+        p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
+      }
+      CHECK(p_cache->Param() == param_);
+      CHECK_EQ(preds.Size(), info.labels.Size());
 
-    return this->Eval(preds, info, p_cache);
+      result = this->Eval(preds, info, p_cache);
+    });
+    return result;
   }
 
   virtual double Eval(HostDeviceVector<float> const& preds, MetaInfo const& info,
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index a8b974f03..76fd2f967 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -189,7 +189,9 @@ double GetMultiMetricEval(xgboost::Metric* metric,
   info.weights_.HostVector() = weights;
   info.group_ptr_ = groups;
   info.data_split_mode = data_split_mode;
-
+  if (info.IsVerticalFederated() && xgboost::collective::GetRank() != 0) {
+    info.labels.Reshape(0);
+  }
   return metric->Evaluate(preds, p_fmat);
 }
 
diff --git a/tests/cpp/metric/test_survival_metric.cu b/tests/cpp/metric/test_survival_metric.cu
index d7ac54860..723f306e4 100644
--- a/tests/cpp/metric/test_survival_metric.cu
+++ b/tests/cpp/metric/test_survival_metric.cu
@@ -2,109 +2,13 @@
  * Copyright (c) by Contributors 2020
  */
 #include <gtest/gtest.h>
-#include <cmath>
+#include "test_survival_metric.h"
 #include "xgboost/metric.h"
-#include "../helpers.h"
-#include "../../../src/common/survival_util.h"
 
 /** Tests for Survival metrics that should run both on CPU and GPU **/
 
 namespace xgboost {
 namespace common {
-namespace {
-inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
-  auto ctx = CreateEmptyGenericParam(device);
-  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
-  metric->Configure(Args{});
-
-  HostDeviceVector<float> predts;
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  auto &h_predts = predts.HostVector();
-
-  SimpleLCG lcg;
-  SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
-
-  size_t n_samples = 2048;
-  h_predts.resize(n_samples);
-
-  for (size_t i = 0; i < n_samples; ++i) {
-    h_predts[i] = dist(&lcg);
-  }
-
-  auto &h_upper = info.labels_upper_bound_.HostVector();
-  auto &h_lower = info.labels_lower_bound_.HostVector();
-  h_lower.resize(n_samples);
-  h_upper.resize(n_samples);
-  for (size_t i = 0; i < n_samples; ++i) {
-    h_lower[i] = 1;
-    h_upper[i] = 10;
-  }
-
-  auto result = metric->Evaluate(predts, p_fmat);
-  for (size_t i = 0; i < 8; ++i) {
-    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
-  }
-}
-
-void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  /**
-   * Test aggregate output from the AFT metric over a small test data set.
-   * This is unlike AFTLoss.* tests, which verify metric values over individual data points.
-   **/
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.num_row_ = 4;
-  info.labels_lower_bound_.HostVector()
-      = { 100.0f, 0.0f, 60.0f, 16.0f };
-  info.labels_upper_bound_.HostVector()
-      = { 100.0f, 20.0f, std::numeric_limits<bst_float>::infinity(), 200.0f };
-  info.weights_.HostVector() = std::vector<bst_float>();
-  info.data_split_mode = data_split_mode;
-  HostDeviceVector<bst_float> preds(4, std::log(64));
-
-  struct TestCase {
-    std::string dist_type;
-    bst_float reference_value;
-  };
-  for (const auto& test_case : std::vector<TestCase>{ {"normal", 2.1508f}, {"logistic", 2.1804f},
-                                                     {"extreme", 2.0706f} }) {
-    std::unique_ptr<Metric> metric(Metric::Create("aft-nloglik", &ctx));
-    metric->Configure({ {"aft_loss_distribution", test_case.dist_type},
-                       {"aft_loss_distribution_scale", "1.0"} });
-    EXPECT_NEAR(metric->Evaluate(preds, p_fmat), test_case.reference_value, 1e-4);
-  }
-}
-
-void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = DataSplitMode::kRow) {
-  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  auto p_fmat = EmptyDMatrix();
-  MetaInfo& info = p_fmat->Info();
-  info.num_row_ = 4;
-  info.labels_lower_bound_.HostVector() = { 20.0f, 0.0f, 60.0f, 16.0f };
-  info.labels_upper_bound_.HostVector() = { 80.0f, 20.0f, 80.0f, 200.0f };
-  info.weights_.HostVector() = std::vector<bst_float>();
-  info.data_split_mode = data_split_mode;
-  HostDeviceVector<bst_float> preds(4, std::log(60.0f));
-
-  std::unique_ptr<Metric> metric(Metric::Create("interval-regression-accuracy", &ctx));
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.75f);
-  info.labels_lower_bound_.HostVector()[2] = 70.0f;
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
-  info.labels_upper_bound_.HostVector()[2] = std::numeric_limits<bst_float>::infinity();
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
-  info.labels_upper_bound_.HostVector()[3] = std::numeric_limits<bst_float>::infinity();
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);
-  info.labels_lower_bound_.HostVector()[0] = 70.0f;
-  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.25f);
-
-  CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX);
-}
-}  // anonymous namespace
-
 TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) { VerifyAFTNegLogLik(); }
 
 TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikRowSplit) {
@@ -140,6 +44,5 @@ TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) {
 
   CheckDeterministicMetricElementWise(StringView{"aft-nloglik"}, GPUIDX);
 }
-
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/metric/test_survival_metric.h b/tests/cpp/metric/test_survival_metric.h
new file mode 100644
index 000000000..75414733d
--- /dev/null
+++ b/tests/cpp/metric/test_survival_metric.h
@@ -0,0 +1,113 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
+#pragma once
+#include <gtest/gtest.h>
+
+#include <cmath>    // for log
+#include <cstddef>  // for size_t
+#include <cstdint>  // for int32_t
+#include <limits>   // for numeric_limits
+#include <memory>   // for unique_ptr
+#include <string>   // for string
+#include <vector>   // for vector
+
+#include "../../../src/common/survival_util.h"
+#include "../helpers.h"
+#include "xgboost/metric.h"
+
+namespace xgboost {
+namespace common {
+/**
+ * Evaluate the metric `name` repeatedly on one fixed input and require that every
+ * evaluation returns exactly the same value.
+ */
+inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) {
+  constexpr size_t kSamples = 2048;
+  constexpr size_t kRepeats = 8;
+
+  auto ctx = CreateEmptyGenericParam(device);
+  std::unique_ptr<Metric> metric{Metric::Create(name.c_str(), &ctx)};
+  metric->Configure(Args{});
+
+  // Predictions: kSamples draws from a default-constructed LCG.
+  SimpleLCG lcg;
+  SimpleRealUniformDistribution<float> dist{0.0f, 1.0f};
+  HostDeviceVector<float> predts;
+  auto& h_predts = predts.HostVector();
+  h_predts.resize(kSamples);
+  for (auto& predt : h_predts) {
+    predt = dist(&lcg);
+  }
+
+  // Labels: every sample gets the same lower bound 1 and upper bound 10.
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.labels_lower_bound_.HostVector().assign(kSamples, 1.0f);
+  info.labels_upper_bound_.HostVector().assign(kSamples, 10.0f);
+
+  // The first evaluation is the reference; every repeat must reproduce it exactly.
+  auto const result = metric->Evaluate(predts, p_fmat);
+  for (size_t repeat = 0; repeat < kRepeats; ++repeat) {
+    ASSERT_EQ(metric->Evaluate(predts, p_fmat), result);
+  }
+}
+
+/**
+ * Check the aggregate aft-nloglik value on a small data set for three loss distributions.
+ * This is unlike AFTLoss.* tests, which verify metric values over individual data points.
+ */
+inline void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  struct TestCase {
+    std::string dist_type;
+    bst_float reference_value;
+  };
+  std::vector<TestCase> const test_cases{
+      {"normal", 2.1508f}, {"logistic", 2.1804f}, {"extreme", 2.0706f}};
+  auto constexpr kInf = std::numeric_limits<bst_float>::infinity();
+
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.num_row_ = 4;
+  info.labels_lower_bound_.HostVector() = {100.0f, 0.0f, 60.0f, 16.0f};
+  info.labels_upper_bound_.HostVector() = {100.0f, 20.0f, kInf, 200.0f};
+  info.weights_.HostVector() = std::vector<bst_float>();
+  info.data_split_mode = data_split_mode;
+  HostDeviceVector<bst_float> preds(4, std::log(64));
+
+  for (auto const& test_case : test_cases) {
+    std::unique_ptr<Metric> metric(Metric::Create("aft-nloglik", &ctx));
+    metric->Configure({{"aft_loss_distribution", test_case.dist_type},
+                       {"aft_loss_distribution_scale", "1.0"}});
+    EXPECT_NEAR(metric->Evaluate(preds, p_fmat), test_case.reference_value, 1e-4);
+  }
+}
+
+inline void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = DataSplitMode::kRow) {
+  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
+
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  info.num_row_ = 4;  // four rows, matching the four-element bound vectors below
+  info.labels_lower_bound_.HostVector() = { 20.0f, 0.0f, 60.0f, 16.0f };
+  info.labels_upper_bound_.HostVector() = { 80.0f, 20.0f, 80.0f, 200.0f };
+  info.weights_.HostVector() = std::vector<bst_float>();
+  info.data_split_mode = data_split_mode;
+  HostDeviceVector<bst_float> preds(4, std::log(60.0f));  // same prediction for all 4 rows
+
+  std::unique_ptr<Metric> metric(Metric::Create("interval-regression-accuracy", &ctx));
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.75f);  // initial bounds
+  info.labels_lower_bound_.HostVector()[2] = 70.0f;
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);  // row 2 lower bound 60 -> 70
+  info.labels_upper_bound_.HostVector()[2] = std::numeric_limits<bst_float>::infinity();
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);  // row 2 upper bound 80 -> inf
+  info.labels_upper_bound_.HostVector()[3] = std::numeric_limits<bst_float>::infinity();
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.50f);  // row 3 upper bound 200 -> inf
+  info.labels_lower_bound_.HostVector()[0] = 70.0f;
+  EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.25f);  // row 0 lower bound 20 -> 70
+
+  CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX);
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 10ba68b49..41e5a63e5 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -65,7 +65,7 @@ class BaseFederatedTest : public ::testing::Test {
 
   void TearDown() override { server_.reset(nullptr); }
 
-  static int const kWorldSize{3};
+  static int constexpr kWorldSize{3};
   std::unique_ptr<ServerForTest> server_;
 };
 
diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/test_federated_learner.cc
index 85d0a2b7d..b7066b6a0 100644
--- a/tests/cpp/plugin/test_federated_learner.cc
+++ b/tests/cpp/plugin/test_federated_learner.cc
@@ -70,7 +70,7 @@ void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json e
 
 class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
   std::unique_ptr<ServerForTest> server_;
-  static int const kWorldSize{3};
+  static int constexpr kWorldSize{3};
 
  protected:
   void SetUp() override { server_ = std::make_unique<ServerForTest>(kWorldSize); }
diff --git a/tests/cpp/plugin/test_federated_metrics.cc b/tests/cpp/plugin/test_federated_metrics.cc
new file mode 100644
index 000000000..1bdda567f
--- /dev/null
+++ b/tests/cpp/plugin/test_federated_metrics.cc
@@ -0,0 +1,243 @@
+/*!
+ * Copyright 2023 XGBoost contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../metric/test_auc.h"
+#include "../metric/test_elementwise_metric.h"
+#include "../metric/test_multiclass_metric.h"
+#include "../metric/test_rank_metric.h"
+#include "../metric/test_survival_metric.h"
+#include "helpers.h"
+
+namespace {
+class FederatedMetricTest : public xgboost::BaseFederatedTest {};  // no members of its own
+}  // anonymous namespace
+
+namespace xgboost {
+namespace metric {  // NOTE(review): Verify* presumably come from ../metric/test_*.h - confirm
+TEST_F(FederatedMetricTest, BinaryAUCRowSplit) {  // AUC checks, row- then column-split
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyBinaryAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, BinaryAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyBinaryAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RankingAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RankingAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, PRAUCRowSplit) {  // PR-AUC checks
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPRAUC, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, PRAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPRAUC, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassPRAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassPRAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassPRAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassPRAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RankingPRAUCRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingPRAUC,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RankingPRAUCColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRankingPRAUC,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RMSERowSplit) {  // RMSE through Quantile checks
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RMSEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, RMSLERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSLE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, RMSLEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyRMSLE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MAERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MAEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MAPERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAPE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MAPEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAPE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MPHERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMPHE, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MPHEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMPHE, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, LogLossRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyLogLoss, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, LogLossColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyLogLoss, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, ErrorRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyError, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, ErrorColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyError, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, PoissonNegLogLikRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPoissonNegLogLik,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, PoissonNegLogLikColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPoissonNegLogLik,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiRMSERowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiRMSE,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiRMSEColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiRMSE,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, QuantileRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyQuantile,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, QuantileColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyQuantile,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassErrorRowSplit) {  // multi-class checks
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassError,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassErrorColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassError,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MultiClassLogLossRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassLogLoss,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MultiClassLogLossColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMultiClassLogLoss,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, PrecisionRowSplit) {  // Precision, NDCG and MAP checks
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPrecision,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, PrecisionColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyPrecision,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, NDCGRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCG, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, NDCGColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCG, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, MAPRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAP, DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, MAPColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyMAP, DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, NDCGExpGainRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCGExpGain,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, NDCGExpGainColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyNDCGExpGain,
+                               DataSplitMode::kCol);
+}
+}  // namespace metric
+}  // namespace xgboost
+
+namespace xgboost {
+namespace common {  // the survival Verify* helpers are declared in xgboost::common
+TEST_F(FederatedMetricTest, AFTNegLogLikRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAFTNegLogLik,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, AFTNegLogLikColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAFTNegLogLik,
+                               DataSplitMode::kCol);
+}
+
+TEST_F(FederatedMetricTest, IntervalRegressionAccuracyRowSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyIntervalRegressionAccuracy,
+                               DataSplitMode::kRow);
+}
+
+TEST_F(FederatedMetricTest, IntervalRegressionAccuracyColumnSplit) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyIntervalRegressionAccuracy,
+                               DataSplitMode::kCol);
+}
+}  // namespace common
+}  // namespace xgboost

From 564df59204f75a305797fa0dbf70a635c797fd47 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 20 Apr 2023 16:29:35 +0800
Subject: [PATCH 04/34] [breaking] [jvm-packages] Remove scala-implemented
 tracker. (#9045)

---
 .../dmlc/xgboost4j/scala/spark/XGBoost.scala  |  20 +-
 .../spark/CommunicatorRobustnessSuite.scala   | 121 +----
 .../xgboost4j/scala/rabit/RabitTracker.scala  | 195 --------
 .../rabit/handler/RabitTrackerHandler.scala   | 361 --------------
 .../rabit/handler/RabitWorkerHandler.scala    | 467 ------------------
 .../xgboost4j/scala/rabit/util/LinkMap.scala  | 136 -----
 .../rabit/util/RabitTrackerHelpers.scala      |  39 --
 .../RabitTrackerConnectionHandlerTest.scala   | 255 ----------
 8 files changed, 9 insertions(+), 1585 deletions(-)
 delete mode 100644 jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala
 delete mode 100644 jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala
 delete mode 100644 jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala
 delete mode 100644 jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala
 delete mode 100644 jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala
 delete mode 100644 jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 281997295..0aeae791a 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@ import scala.util.Random
 import scala.collection.JavaConverters._
 
 import ml.dmlc.xgboost4j.java.{Communicator, IRabitTracker, XGBoostError, RabitTracker => PyRabitTracker}
-import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
 import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams
 import ml.dmlc.xgboost4j.scala.ExternalCheckpointManager
 import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _}
@@ -44,21 +43,16 @@ import org.apache.spark.sql.SparkSession
  *                                Use a finite, non-zero timeout value to prevent tracker from
  *                                hanging indefinitely (in milliseconds)
- *                                (supported by "scala" implementation only.)
+ *                                (was supported only by the removed "scala" implementation.)
- * @param trackerImpl Choice between "python" or "scala". The former utilizes the Java wrapper of
- *                    the Python Rabit tracker (in dmlc_core), whereas the latter is implemented
- *                    in Scala without Python components, and with full support of timeouts.
- *                    The Scala implementation is currently experimental, use at your own risk.
- *
  * @param hostIp The Rabit Tracker host IP address which is only used for python implementation.
  *               This is only needed if the host IP cannot be automatically guessed.
  * @param pythonExec The python executed path for Rabit Tracker,
  *                   which is only used for python implementation.
  */
-case class TrackerConf(workerConnectionTimeout: Long, trackerImpl: String,
+case class TrackerConf(workerConnectionTimeout: Long,
   hostIp: String = "", pythonExec: String = "")
 
 object TrackerConf {
-  def apply(): TrackerConf = TrackerConf(0L, "python")
+  def apply(): TrackerConf = TrackerConf(0L)
 }
 
 private[scala] case class XGBoostExecutionEarlyStoppingParams(numEarlyStoppingRounds: Int,
@@ -349,11 +343,9 @@ object XGBoost extends Serializable {
 
-  /** visiable for testing */
+  /** visible for testing */
   private[scala] def getTracker(nWorkers: Int, trackerConf: TrackerConf): IRabitTracker = {
-    val tracker: IRabitTracker = trackerConf.trackerImpl match {
-      case "scala" => new RabitTracker(nWorkers)
-      case "python" => new PyRabitTracker(nWorkers, trackerConf.hostIp, trackerConf.pythonExec)
-      case _ => new PyRabitTracker(nWorkers)
-    }
+    val tracker: IRabitTracker = new PyRabitTracker(
+      nWorkers, trackerConf.hostIp, trackerConf.pythonExec
+    )
     tracker
   }
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
index 579e3dd37..04081c3fe 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
@@ -22,7 +22,6 @@ import scala.util.Random
 
 import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.java.IRabitTracker.TrackerStatus
-import ml.dmlc.xgboost4j.scala.rabit.{RabitTracker => ScalaRabitTracker}
 import ml.dmlc.xgboost4j.scala.DMatrix
 import org.scalatest.FunSuite
 
@@ -40,7 +39,7 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
 
     val paramMap = Map(
       "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(0L, "python", hostIp))
+      "tracker_conf" -> TrackerConf(0L, hostIp))
     val xgbExecParams = getXGBoostExecutionParams(paramMap)
     val tracker = XGBoost.getTracker(xgbExecParams.numWorkers, xgbExecParams.trackerConf)
     tracker match {
@@ -53,7 +52,7 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
 
     val paramMap1 = Map(
       "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(0L, "python", "", pythonExec))
+      "tracker_conf" -> TrackerConf(0L, "", pythonExec))
     val xgbExecParams1 = getXGBoostExecutionParams(paramMap1)
     val tracker1 = XGBoost.getTracker(xgbExecParams1.numWorkers, xgbExecParams1.trackerConf)
     tracker1 match {
@@ -66,7 +65,7 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
 
     val paramMap2 = Map(
       "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(0L, "python", hostIp, pythonExec))
+      "tracker_conf" -> TrackerConf(0L, hostIp, pythonExec))
     val xgbExecParams2 = getXGBoostExecutionParams(paramMap2)
     val tracker2 = XGBoost.getTracker(xgbExecParams2.numWorkers, xgbExecParams2.trackerConf)
     tracker2 match {
@@ -78,58 +77,6 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
     }
   }
 
-  test("training with Scala-implemented Rabit tracker") {
-    val eval = new EvalError()
-    val training = buildDataFrame(Classification.train)
-    val testDM = new DMatrix(Classification.test.iterator)
-    val paramMap = Map("eta" -> "1", "max_depth" -> "6",
-      "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> numWorkers,
-      "tracker_conf" -> TrackerConf(60 * 60 * 1000, "scala"))
-    val model = new XGBoostClassifier(paramMap).fit(training)
-    assert(eval.eval(model._booster.predict(testDM, outPutMargin = true), testDM) < 0.1)
-  }
-
-  test("test Communicator allreduce to validate Scala-implemented Rabit tracker") {
-    val vectorLength = 100
-    val rdd = sc.parallelize(
-      (1 to numWorkers * vectorLength).toArray.map { _ => Random.nextFloat() }, numWorkers).cache()
-
-    val tracker = new ScalaRabitTracker(numWorkers)
-    tracker.start(0)
-    val trackerEnvs = tracker.getWorkerEnvs
-    val collectedAllReduceResults = new LinkedBlockingDeque[Array[Float]]()
-
-    val rawData = rdd.mapPartitions { iter =>
-      Iterator(iter.toArray)
-    }.collect()
-
-    val maxVec = (0 until vectorLength).toArray.map { j =>
-      (0 until numWorkers).toArray.map { i => rawData(i)(j) }.max
-    }
-
-    val allReduceResults = rdd.mapPartitions { iter =>
-      Communicator.init(trackerEnvs)
-      val arr = iter.toArray
-      val results = Communicator.allReduce(arr, Communicator.OpType.MAX)
-      Communicator.shutdown()
-      Iterator(results)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        allReduceResults.foreachPartition(() => _)
-        val byPartitionResults = allReduceResults.collect()
-        assert(byPartitionResults(0).length == vectorLength)
-        collectedAllReduceResults.put(byPartitionResults(0))
-      }
-    }
-    sparkThread.start()
-    assert(tracker.waitFor(0L) == 0)
-    sparkThread.join()
-
-    assert(collectedAllReduceResults.poll().sameElements(maxVec))
-  }
-
   test("test Java RabitTracker wrapper's exception handling: it should not hang forever.") {
     /*
       Deliberately create new instances of SparkContext in each unit test to avoid reusing the
@@ -193,68 +140,6 @@ class CommunicatorRobustnessSuite extends FunSuite with PerTest {
     assert(tracker.waitFor(0) != 0)
   }
 
-  test("test Scala RabitTracker's exception handling: it should not hang forever.") {
-    val rdd = sc.parallelize(1 to numWorkers, numWorkers).cache()
-
-    val tracker = new ScalaRabitTracker(numWorkers)
-    tracker.start(0)
-    val trackerEnvs = tracker.getWorkerEnvs
-
-    val workerCount: Int = numWorkers
-    val dummyTasks = rdd.mapPartitions { iter =>
-      Communicator.init(trackerEnvs)
-      val index = iter.next()
-      Thread.sleep(100 + index * 10)
-      if (index == workerCount) {
-        // kill the worker by throwing an exception
-        throw new RuntimeException("Worker exception.")
-      }
-      Communicator.shutdown()
-      Iterator(index)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        // forces a Spark job.
-        dummyTasks.foreachPartition(() => _)
-      }
-    }
-    sparkThread.setUncaughtExceptionHandler(tracker)
-    sparkThread.start()
-    assert(tracker.waitFor(0L) == TrackerStatus.FAILURE.getStatusCode)
-  }
-
-  test("test Scala RabitTracker's workerConnectionTimeout") {
-    val rdd = sc.parallelize(1 to numWorkers, numWorkers).cache()
-
-    val tracker = new ScalaRabitTracker(numWorkers)
-    tracker.start(500)
-    val trackerEnvs = tracker.getWorkerEnvs
-
-    val dummyTasks = rdd.mapPartitions { iter =>
-      val index = iter.next()
-      // simulate that the first worker cannot connect to tracker due to network issues.
-      if (index != 1) {
-        Communicator.init(trackerEnvs)
-        Thread.sleep(1000)
-        Communicator.shutdown()
-      }
-
-      Iterator(index)
-    }.cache()
-
-    val sparkThread = new Thread() {
-      override def run(): Unit = {
-        // forces a Spark job.
-        dummyTasks.foreachPartition(() => _)
-      }
-    }
-    sparkThread.setUncaughtExceptionHandler(tracker)
-    sparkThread.start()
-    // should fail due to connection timeout
-    assert(tracker.waitFor(0L) == TrackerStatus.FAILURE.getStatusCode)
-  }
-
   test("should allow the dataframe containing communicator calls to be partially evaluated for" +
     " multiple times (ISSUE-4406)") {
     val paramMap = Map(
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala
deleted file mode 100644
index fb388d083..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTracker.scala
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit
-
-import java.net.{InetAddress, InetSocketAddress}
-
-import akka.actor.ActorSystem
-import akka.pattern.ask
-import ml.dmlc.xgboost4j.java.{IRabitTracker, TrackerProperties}
-import ml.dmlc.xgboost4j.scala.rabit.handler.RabitTrackerHandler
-
-import scala.concurrent.duration._
-import scala.concurrent.{Await, Future}
-import scala.util.{Failure, Success, Try}
-
-/**
-  * Scala implementation of the Rabit tracker interface without Python dependency.
-  * The Scala Rabit tracker fully implements the timeout logic, effectively preventing the tracker
-  * (and thus any distributed tasks) to hang indefinitely due to network issues or worker node
-  * failures.
-  *
-  * Note that this implementation is currently experimental, and should be used at your own risk.
-  *
-  * Example usage:
-  * {{{
-  *   import scala.concurrent.duration._
-  *
-  *   val tracker = new RabitTracker(32)
-  *   // allow up to 10 minutes for all workers to connect to the tracker.
-  *   tracker.start(10 minutes)
-  *
-  *   /* ...
-  *      launching workers in parallel
-  *      ...
-  *   */
-  *
-  *   // wait for worker execution up to 6 hours.
-  *   // providing a finite timeout prevents a long-running task from hanging forever in
-  *   // catastrophic events, like the loss of an executor during model training.
-  *   tracker.waitFor(6 hours)
-  * }}}
-  *
-  * @param numWorkers Number of distributed workers from which the tracker expects connections.
-  * @param port The minimum port number that the tracker binds to.
-  *             If port is omitted, or given as None, a random ephemeral port is chosen at runtime.
-  * @param maxPortTrials The maximum number of trials of socket binding, by sequentially
-  *                      increasing the port number.
-  */
-private[scala] class RabitTracker(numWorkers: Int, port: Option[Int] = None,
-                                  maxPortTrials: Int = 1000)
-  extends IRabitTracker {
-
-  import scala.collection.JavaConverters._
-
-  require(numWorkers >=1, "numWorkers must be greater than or equal to one (1).")
-
-  val system = ActorSystem.create("RabitTracker")
-  val handler = system.actorOf(RabitTrackerHandler.props(numWorkers), "Handler")
-  implicit val askTimeout: akka.util.Timeout = akka.util.Timeout(30 seconds)
-  private[this] val tcpBindingTimeout: Duration = 1 minute
-
-  var workerEnvs: Map[String, String] = Map.empty
-
-  override def uncaughtException(t: Thread, e: Throwable): Unit = {
-    handler ? RabitTrackerHandler.InterruptTracker(e)
-  }
-
-  /**
-    * Start the Rabit tracker.
-    *
-    * @param timeout The timeout for awaiting connections from worker nodes.
-    *      Note that when used in Spark applications, because all Spark transformations are
-    *      lazily executed, the I/O time for loading RDDs/DataFrames from external sources
-    *      (local dist, HDFS, S3 etc.) must be taken into account for the timeout value.
-    *      If the timeout value is too small, the Rabit tracker will likely timeout before workers
-    *      establishing connections to the tracker, due to the overhead of loading data.
-    *      Using a finite timeout is encouraged, as it prevents the tracker (thus the Spark driver
-    *      running it) from hanging indefinitely due to worker connection issues (e.g. firewall.)
-    * @return Boolean flag indicating if the Rabit tracker starts successfully.
-    */
-  private def start(timeout: Duration): Boolean = {
-    val hostAddress = Option(TrackerProperties.getInstance().getHostIp)
-      .map(InetAddress.getByName).getOrElse(InetAddress.getLocalHost)
-
-    handler ? RabitTrackerHandler.StartTracker(
-      new InetSocketAddress(hostAddress, port.getOrElse(0)), maxPortTrials, timeout)
-
-    // block by waiting for the actor to bind to a port
-    Try(Await.result(handler ? RabitTrackerHandler.RequestBoundFuture, askTimeout.duration)
-      .asInstanceOf[Future[Map[String, String]]]) match {
-      case Success(futurePortBound) =>
-        // The success of the Future is contingent on binding to an InetSocketAddress.
-        val isBound = Try(Await.ready(futurePortBound, tcpBindingTimeout)).isSuccess
-        if (isBound) {
-          workerEnvs = Await.result(futurePortBound, 0 nano)
-        }
-        isBound
-      case Failure(ex: Throwable) =>
-        false
-    }
-  }
-
-  /**
-    * Start the Rabit tracker.
-    *
-    * @param connectionTimeoutMillis Timeout, in milliseconds, for the tracker to wait for worker
-    *                                connections. If a non-positive value is provided, the tracker
-    *                                waits for incoming worker connections indefinitely.
-    * @return Boolean flag indicating if the Rabit tracker starts successfully.
-    */
-  def start(connectionTimeoutMillis: Long): Boolean = {
-    if (connectionTimeoutMillis <= 0) {
-      start(Duration.Inf)
-    } else {
-      start(Duration.fromNanos(connectionTimeoutMillis * 1e6))
-    }
-  }
-
-  def stop(): Unit = {
-    system.terminate()
-  }
-
-  /**
-    * Get a Map of necessary environment variables to initiate Rabit workers.
-    *
-    * @return HashMap containing tracker information.
-    */
-  def getWorkerEnvs: java.util.Map[String, String] = {
-    new java.util.HashMap((workerEnvs ++ Map(
-        "DMLC_NUM_WORKER" -> numWorkers.toString,
-        "DMLC_NUM_SERVER" -> "0"
-    )).asJava)
-  }
-
-  /**
-    * Await workers to complete assigned tasks for at most 'atMostMillis' milliseconds.
-    * This method blocks until timeout or task completion.
-    *
-    * @param atMost the maximum execution time for the workers. By default,
-    *     the tracker waits for the workers indefinitely.
-    * @return 0 if the tasks complete successfully, and non-zero otherwise.
-    */
-  private def waitFor(atMost: Duration): Int = {
-    // request the completion Future from the tracker actor
-    Try(Await.result(handler ? RabitTrackerHandler.RequestCompletionFuture, askTimeout.duration)
-      .asInstanceOf[Future[Int]]) match {
-      case Success(futureCompleted) =>
-        // wait for all workers to complete synchronously.
-        val statusCode = Try(Await.result(futureCompleted, atMost)) match {
-          case Success(n) if n == numWorkers =>
-            IRabitTracker.TrackerStatus.SUCCESS.getStatusCode
-          case Success(n) if n < numWorkers =>
-            IRabitTracker.TrackerStatus.TIMEOUT.getStatusCode
-          case Failure(e) =>
-            IRabitTracker.TrackerStatus.FAILURE.getStatusCode
-        }
-        system.terminate()
-        statusCode
-      case Failure(ex: Throwable) =>
-        system.terminate()
-        IRabitTracker.TrackerStatus.FAILURE.getStatusCode
-    }
-  }
-
-  /**
-    * Await workers to complete assigned tasks for at most 'atMostMillis' milliseconds.
-    * This method blocks until timeout or task completion.
-    *
-    * @param atMostMillis Number of milliseconds for the tracker to wait for workers. If a
-    *                     non-positive number is given, the tracker waits indefinitely.
-    * @return 0 if the tasks complete successfully, and non-zero otherwise
-    */
-  def waitFor(atMostMillis: Long): Int = {
-    if (atMostMillis <= 0) {
-      waitFor(Duration.Inf)
-    } else {
-      waitFor(Duration.fromNanos(atMostMillis * 1e6))
-    }
-  }
-}
-
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala
deleted file mode 100644
index f9de71746..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitTrackerHandler.scala
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.handler
-
-import java.net.InetSocketAddress
-import java.util.UUID
-
-import scala.concurrent.duration._
-import scala.collection.mutable
-import scala.concurrent.{Promise, TimeoutException}
-import akka.io.{IO, Tcp}
-import akka.actor._
-import ml.dmlc.xgboost4j.java.XGBoostError
-import ml.dmlc.xgboost4j.scala.rabit.util.{AssignedRank, LinkMap}
-
-import scala.util.{Failure, Random, Success, Try}
-
-/** The Akka actor for handling and coordinating Rabit worker connections.
-  * This is the main actor for handling socket connections, interacting with the synchronous
-  * tracker interface, and resolving tree/ring/parent dependencies between workers.
-  *
-  * @param numWorkers Number of workers to track.
-  */
-private[scala] class RabitTrackerHandler(numWorkers: Int)
-  extends Actor with ActorLogging {
-
-  import context.system
-  import RabitWorkerHandler._
-  import RabitTrackerHandler._
-
-  private[this] val promisedWorkerEnvs = Promise[Map[String, String]]()
-  private[this] val promisedShutdownWorkers = Promise[Int]()
-  private[this] val tcpManager = IO(Tcp)
-
-  // resolves worker connection dependency.
-  val resolver = context.actorOf(Props(classOf[WorkerDependencyResolver], self), "Resolver")
-
-  // workers that have sent "shutdown" signal
-  private[this] val shutdownWorkers = mutable.Set.empty[Int]
-  private[this] val jobToRankMap = mutable.HashMap.empty[String, Int]
-  private[this] val actorRefToHost = mutable.HashMap.empty[ActorRef, String]
-  private[this] val ranksToAssign = mutable.ListBuffer(0 until numWorkers: _*)
-  private[this] var maxPortTrials = 0
-  private[this] var workerConnectionTimeout: Duration = Duration.Inf
-  private[this] var portTrials = 0
-  private[this] val startedWorkers = mutable.Set.empty[Int]
-
-  val linkMap = new LinkMap(numWorkers)
-
-  def decideRank(rank: Int, jobId: String = "NULL"): Option[Int] = {
-    rank match {
-      case r if r >= 0 => Some(r)
-      case _ =>
-        jobId match {
-          case "NULL" => None
-          case jid => jobToRankMap.get(jid)
-        }
-    }
-  }
-
-  /**
-    * Handler for all Akka Tcp connection/binding events. Read/write over the socket is handled
-    * by the RabitWorkerHandler.
-    *
-    * @param event Generic Tcp.Event
-    */
-  private def handleTcpEvents(event: Tcp.Event): Unit = event match {
-    case Tcp.Bound(local) =>
-      // expect all workers to connect within timeout
-      log.info(s"Tracker listening @ ${local.getAddress.getHostAddress}:${local.getPort}")
-      log.info(s"Worker connection timeout is $workerConnectionTimeout.")
-
-      context.setReceiveTimeout(workerConnectionTimeout)
-      promisedWorkerEnvs.success(Map(
-        "DMLC_TRACKER_URI" -> local.getAddress.getHostAddress,
-        "DMLC_TRACKER_PORT" -> local.getPort.toString,
-        // not required because the world size will be communicated to the
-        // worker node after the rank is assigned.
-        "rabit_world_size" -> numWorkers.toString
-      ))
-
-    case Tcp.CommandFailed(cmd: Tcp.Bind) =>
-      if (portTrials < maxPortTrials) {
-        portTrials += 1
-        tcpManager ! Tcp.Bind(self,
-          new InetSocketAddress(cmd.localAddress.getAddress, cmd.localAddress.getPort + 1),
-          backlog = 256)
-      }
-
-    case Tcp.Connected(remote, local) =>
-      log.debug(s"Incoming connection from worker @ ${remote.getAddress.getHostAddress}")
-      // revoke timeout if all workers have connected.
-      val workerHandler = context.actorOf(RabitWorkerHandler.props(
-        remote.getAddress.getHostAddress, numWorkers, self, sender()
-      ), s"ConnectionHandler-${UUID.randomUUID().toString}")
-      val connection = sender()
-      connection ! Tcp.Register(workerHandler, keepOpenOnPeerClosed = true)
-
-      actorRefToHost.put(workerHandler, remote.getAddress.getHostName)
-  }
-
-  /**
-    * Handles external tracker control messages sent by RabitTracker (usually in ask patterns)
-    * to interact with the tracker interface.
-    *
-    * @param trackerMsg control messages sent by RabitTracker class.
-    */
-  private def handleTrackerControlMessage(trackerMsg: TrackerControlMessage): Unit =
-    trackerMsg match {
-
-    case msg: StartTracker =>
-      maxPortTrials = msg.maxPortTrials
-      workerConnectionTimeout = msg.connectionTimeout
-
-      // if the port number is missing, try binding to a random ephemeral port.
-      if (msg.addr.getPort == 0) {
-        tcpManager ! Tcp.Bind(self,
-          new InetSocketAddress(msg.addr.getAddress, new Random().nextInt(61000 - 32768) + 32768),
-          backlog = 256)
-      } else {
-        tcpManager ! Tcp.Bind(self, msg.addr, backlog = 256)
-      }
-      sender() ! true
-
-    case RequestBoundFuture =>
-      sender() ! promisedWorkerEnvs.future
-
-    case RequestCompletionFuture =>
-      sender() ! promisedShutdownWorkers.future
-
-    case InterruptTracker(e) =>
-      log.error(e, "Uncaught exception thrown by worker.")
-      // make sure that waitFor() does not hang indefinitely.
-      promisedShutdownWorkers.failure(e)
-      context.stop(self)
-  }
-
-  /**
-    * Handles messages sent by child actors representing connecting Rabit workers, by brokering
-    * messages to the dependency resolver, and processing worker commands.
-    *
-    * @param workerMsg Message sent by RabitWorkerHandler actors.
-    */
-  private def handleRabitWorkerMessage(workerMsg: RabitWorkerRequest): Unit = workerMsg match {
-    case req @ RequestAwaitConnWorkers(_, _) =>
-      // since the requester may request to connect to other workers
-      // that have not fully set up, delegate this request to the
-      // dependency resolver which handles the dependencies properly.
-      resolver forward req
-
-    // ---- Rabit worker commands: start/recover/shutdown/print ----
-    case WorkerTrackerPrint(_, _, _, msg) =>
-      log.info(msg.trim)
-
-    case WorkerShutdown(rank, _, _) =>
-      assert(rank >= 0, "Invalid rank.")
-      assert(!shutdownWorkers.contains(rank))
-      shutdownWorkers.add(rank)
-
-      log.info(s"Received shutdown signal from $rank")
-
-      if (shutdownWorkers.size == numWorkers) {
-        promisedShutdownWorkers.success(shutdownWorkers.size)
-      }
-
-    case WorkerRecover(prevRank, worldSize, jobId) =>
-      assert(prevRank >= 0)
-      sender() ! linkMap.assignRank(prevRank)
-
-    case WorkerStart(rank, worldSize, jobId) =>
-      assert(worldSize == numWorkers || worldSize == -1,
-        s"Purported worldSize ($worldSize) does not match worker count ($numWorkers)."
-      )
-
-      Try(decideRank(rank, jobId).getOrElse(ranksToAssign.remove(0))) match {
-        case Success(wkRank) =>
-          if (jobId != "NULL") {
-            jobToRankMap.put(jobId, wkRank)
-          }
-
-          val assignedRank = linkMap.assignRank(wkRank)
-          sender() ! assignedRank
-          resolver ! assignedRank
-
-          log.info("Received start signal from " +
-            s"${actorRefToHost.getOrElse(sender(), "")} [rank: $wkRank]")
-
-        case Failure(ex: IndexOutOfBoundsException) =>
-          // More than worldSize workers have connected, likely due to executor loss.
-          // Since Rabit currently does not support crash recovery (because the Allreduce results
-          // are not cached by the tracker, and because existing workers cannot reestablish
-          // connections to newly spawned executor/worker), the most reasonble action here is to
-          // interrupt the tracker immediate with failure state.
-          log.error("Received invalid start signal from " +
-            s"${actorRefToHost.getOrElse(sender(), "")}: all $worldSize workers have started."
-          )
-          promisedShutdownWorkers.failure(new XGBoostError("Invalid start signal" +
-            " received from worker, likely due to executor loss."))
-
-        case Failure(ex) =>
-          log.error(ex, "Unexpected error")
-          promisedShutdownWorkers.failure(ex)
-      }
-
-
-    // ---- Dependency resolving related messages ----
-    case msg @ WorkerStarted(host, rank, awaitingAcceptance) =>
-      log.info(s"Worker $host (rank: $rank) has started.")
-      resolver forward msg
-
-      startedWorkers.add(rank)
-      if (startedWorkers.size == numWorkers) {
-        log.info("All workers have started.")
-      }
-
-    case req @ DropFromWaitingList(_) =>
-      // all peer workers in dependency link map have connected;
-      // forward message to resolver to update dependencies.
-      resolver forward req
-
-    case _ =>
-  }
-
-  def receive: Actor.Receive = {
-    case tcpEvent: Tcp.Event => handleTcpEvents(tcpEvent)
-    case trackerMsg: TrackerControlMessage => handleTrackerControlMessage(trackerMsg)
-    case workerMsg: RabitWorkerRequest => handleRabitWorkerMessage(workerMsg)
-
-    case akka.actor.ReceiveTimeout =>
-      if (startedWorkers.size < numWorkers) {
-        promisedShutdownWorkers.failure(
-          new TimeoutException("Timed out waiting for workers to connect: " +
-            s"${numWorkers - startedWorkers.size} of $numWorkers did not start/connect.")
-        )
-        context.stop(self)
-      }
-
-      context.setReceiveTimeout(Duration.Undefined)
-  }
-}
-
-/**
-  * Resolve the dependency between nodes as they connect to the tracker.
-  * The dependency is enforced that a worker of rank K depends on its neighbors (from the treeMap
-  * and ringMap) whose ranks are smaller than K. Since ranks are assigned in the order of
-  * connections by workers, this dependency constraint assumes that a worker node connects first
-  * is likely to finish setup first.
-  */
-private[rabit] class WorkerDependencyResolver(handler: ActorRef) extends Actor with ActorLogging {
-  import RabitWorkerHandler._
-
-  context.watch(handler)
-
-  case class Fulfillment(toConnectSet: Set[Int], promise: Promise[AwaitingConnections])
-
-  // worker nodes that have connected, but have not send WorkerStarted message.
-  private val dependencyMap = mutable.Map.empty[Int, Set[Int]]
-  private val startedWorkers = mutable.Set.empty[Int]
-  // worker nodes that have started, and await for connections.
-  private val awaitConnWorkers = mutable.Map.empty[Int, ActorRef]
-  private val pendingFulfillment = mutable.Map.empty[Int, Fulfillment]
-
-  def awaitingWorkers(linkSet: Set[Int]): AwaitingConnections = {
-    val connSet = awaitConnWorkers.toMap
-      .filterKeys(k => linkSet.contains(k))
-    AwaitingConnections(connSet, linkSet.size - connSet.size)
-  }
-
-  def receive: Actor.Receive = {
-    // a copy of the AssignedRank message that is also sent to the worker
-    case AssignedRank(rank, tree_neighbors, ring, parent) =>
-      // the workers that the worker of given `rank` depends on:
-      // worker of rank K only depends on workers with rank smaller than K.
-      val dependentWorkers = (tree_neighbors.toSet ++ Set(ring._1, ring._2))
-        .filter{ r => r != -1 && r < rank}
-
-      log.debug(s"Rank $rank connected, dependencies: $dependentWorkers")
-      dependencyMap.put(rank, dependentWorkers)
-
-    case RequestAwaitConnWorkers(rank, toConnectSet) =>
-      val promise = Promise[AwaitingConnections]()
-
-      assert(dependencyMap.contains(rank))
-
-      val updatedDependency = dependencyMap(rank) diff startedWorkers
-      if (updatedDependency.isEmpty) {
-        // all dependencies are satisfied
-        log.debug(s"Rank $rank has all dependencies satisfied.")
-        promise.success(awaitingWorkers(toConnectSet))
-      } else {
-        log.debug(s"Rank $rank's request for AwaitConnWorkers is pending fulfillment.")
-        // promise is pending fulfillment due to unresolved dependency
-        pendingFulfillment.put(rank, Fulfillment(toConnectSet, promise))
-      }
-
-      sender() ! promise.future
-
-    case WorkerStarted(_, started, awaitingAcceptance) =>
-      startedWorkers.add(started)
-      if (awaitingAcceptance > 0) {
-        awaitConnWorkers.put(started, sender())
-      }
-
-      // remove the started rank from all dependencies.
-      dependencyMap.remove(started)
-      dependencyMap.foreach { case (r, dset) =>
-        val updatedDependency = dset diff startedWorkers
-        // fulfill the future if all dependencies are met (started.)
-        if (updatedDependency.isEmpty) {
-          log.debug(s"Rank $r has all dependencies satisfied.")
-          pendingFulfillment.remove(r).map{
-            case Fulfillment(toConnectSet, promise) =>
-              promise.success(awaitingWorkers(toConnectSet))
-          }
-        }
-
-        dependencyMap.update(r, updatedDependency)
-      }
-
-    case DropFromWaitingList(rank) =>
-      assert(awaitConnWorkers.remove(rank).isDefined)
-
-    case Terminated(ref) =>
-      if (ref.equals(handler)) {
-        context.stop(self)
-      }
-  }
-}
-
-private[scala] object RabitTrackerHandler {
-  // Messages sent by RabitTracker to this RabitTrackerHandler actor
-  trait TrackerControlMessage
-  case object RequestCompletionFuture extends TrackerControlMessage
-  case object RequestBoundFuture extends TrackerControlMessage
-  // Start the Rabit tracker at given socket address awaiting worker connections.
-  // All workers must connect to the tracker before connectionTimeout, otherwise the tracker will
-  // shut down due to timeout.
-  case class StartTracker(addr: InetSocketAddress,
-                          maxPortTrials: Int,
-                          connectionTimeout: Duration) extends TrackerControlMessage
-  // To interrupt the tracker handler due to uncaught exception thrown by the thread acting as
-  // driver for the distributed training.
-  case class InterruptTracker(e: Throwable) extends TrackerControlMessage
-
-  def props(numWorkers: Int): Props =
-    Props(new RabitTrackerHandler(numWorkers))
-}
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala
deleted file mode 100644
index 234c4d25a..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/handler/RabitWorkerHandler.scala
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.handler
-
-import java.nio.{ByteBuffer, ByteOrder}
-
-import akka.io.Tcp
-import akka.actor._
-import akka.util.ByteString
-import ml.dmlc.xgboost4j.scala.rabit.util.{AssignedRank, RabitTrackerHelpers}
-
-import scala.concurrent.{Await, Future}
-import scala.concurrent.duration._
-import scala.util.Try
-
-/**
-  * Actor to handle socket communication from worker node.
-  * To handle fragmentation in received data, this class acts like a FSM
-  * (finite-state machine) to keep track of the internal states.
-  *
-  * @param host IP address of the remote worker
-  * @param worldSize number of total workers
-  * @param tracker the RabitTrackerHandler actor reference
-  */
-private[scala] class RabitWorkerHandler(host: String, worldSize: Int, tracker: ActorRef,
-                                        connection: ActorRef)
-  extends FSM[RabitWorkerHandler.State, RabitWorkerHandler.DataStruct]
-    with ActorLogging with Stash {
-
-  import RabitWorkerHandler._
-  import RabitTrackerHelpers._
-
-  private[this] var rank: Int = 0
-  private[this] var port: Int = 0
-
-  // indicate if the connection is transient (like "print" or "shutdown")
-  private[this] var transient: Boolean = false
-  private[this] var peerClosed: Boolean = false
-
-  // number of workers pending acceptance of current worker
-  private[this] var awaitingAcceptance: Int = 0
-  private[this] var neighboringWorkers = Set.empty[Int]
-
-  // TODO: use a single memory allocation to host all buffers,
-  // including the transient ones for writing.
-  private[this] val readBuffer = ByteBuffer.allocate(4096)
-    .order(ByteOrder.nativeOrder())
-  // in case the received message is longer than needed,
-  // stash the spilled over part in this buffer, and send
-  // to self when transition occurs.
-  private[this] val spillOverBuffer = ByteBuffer.allocate(4096)
-    .order(ByteOrder.nativeOrder())
-  // when setup is complete, need to notify peer handlers
-  // to reduce the awaiting-connection counter.
-  private[this] var pendingAcknowledgement: Option[AcknowledgeAcceptance] = None
-
-  private def resetBuffers(): Unit = {
-    readBuffer.clear()
-    if (spillOverBuffer.position() > 0) {
-      spillOverBuffer.flip()
-      self ! Tcp.Received(ByteString.fromByteBuffer(spillOverBuffer))
-      spillOverBuffer.clear()
-    }
-  }
-
-  private def stashSpillOver(buf: ByteBuffer): Unit = {
-    if (buf.remaining() > 0) spillOverBuffer.put(buf)
-  }
-
-  def getNeighboringWorkers: Set[Int] = neighboringWorkers
-
-  def decodeCommand(buffer: ByteBuffer): TrackerCommand = {
-    val readBuffer = buffer.duplicate().order(ByteOrder.nativeOrder())
-    readBuffer.flip()
-
-    val rank = readBuffer.getInt()
-    val worldSize = readBuffer.getInt()
-    val jobId = readBuffer.getString
-
-    val command = readBuffer.getString
-    val trackerCommand = command match {
-      case "start" => WorkerStart(rank, worldSize, jobId)
-      case "shutdown" =>
-        transient = true
-        WorkerShutdown(rank, worldSize, jobId)
-      case "recover" =>
-        require(rank >= 0, "Invalid rank for recovering worker.")
-        WorkerRecover(rank, worldSize, jobId)
-      case "print" =>
-        transient = true
-        WorkerTrackerPrint(rank, worldSize, jobId, readBuffer.getString)
-    }
-
-    stashSpillOver(readBuffer)
-    trackerCommand
-  }
-
-  startWith(AwaitingHandshake, DataStruct())
-
-  when(AwaitingHandshake) {
-    case Event(Tcp.Received(magic), _) =>
-      assert(magic.length == 4)
-      val purportedMagic = magic.asNativeOrderByteBuffer.getInt
-      assert(purportedMagic == MAGIC_NUMBER, s"invalid magic number $purportedMagic from $host")
-
-      // echo back the magic number
-      connection ! Tcp.Write(magic)
-      goto(AwaitingCommand) using StructTrackerCommand
-  }
-
-  when(AwaitingCommand) {
-    case Event(Tcp.Received(bytes), validator) =>
-      bytes.asByteBuffers.foreach { buf => readBuffer.put(buf) }
-      if (validator.verify(readBuffer)) {
-        Try(decodeCommand(readBuffer)) match {
-          case scala.util.Success(decodedCommand) =>
-            tracker ! decodedCommand
-          case scala.util.Failure(th: java.nio.BufferUnderflowException) =>
-            // BufferUnderflowException would occur if the message to print has not arrived yet.
-            // Do nothing, wait for next Tcp.Received event
-          case scala.util.Failure(th: Throwable) => throw th
-        }
-      }
-
-      stay
-    // when rank for a worker is assigned, send encoded rank information
-    // back to worker over Tcp socket.
-    case Event(aRank @ AssignedRank(assignedRank, neighbors, ring, parent), _) =>
-      log.debug(s"Assigned rank [$assignedRank] for $host, T: $neighbors, R: $ring, P: $parent")
-
-      rank = assignedRank
-      // ranks from the ring
-      val ringRanks = List(
-        // ringPrev
-        if (ring._1 != -1 && ring._1 != rank) ring._1 else -1,
-        // ringNext
-        if (ring._2 != -1 && ring._2 != rank) ring._2 else -1
-      )
-
-      // update the set of all linked workers to current worker.
-      neighboringWorkers = neighbors.toSet ++ ringRanks.filterNot(_ == -1).toSet
-
-      connection ! Tcp.Write(ByteString.fromByteBuffer(aRank.toByteBuffer(worldSize)))
-      // to prevent reading before state transition
-      connection ! Tcp.SuspendReading
-      goto(BuildingLinkMap) using StructNodes
-  }
-
-  when(BuildingLinkMap) {
-    case Event(Tcp.Received(bytes), validator) =>
-      bytes.asByteBuffers.foreach { buf =>
-        readBuffer.put(buf)
-      }
-
-      if (validator.verify(readBuffer)) {
-        readBuffer.flip()
-        // for a freshly started worker, numConnected should be 0.
-        val numConnected = readBuffer.getInt()
-        val toConnectSet = neighboringWorkers.diff(
-          (0 until numConnected).map { index => readBuffer.getInt() }.toSet)
-
-        // check which workers are currently awaiting connections
-        tracker ! RequestAwaitConnWorkers(rank, toConnectSet)
-      }
-      stay
-
-    // got a Future from the tracker (resolver) about workers that are
-    // currently awaiting connections (particularly from this node.)
-    case Event(future: Future[_], _) =>
-      // blocks execution until all dependencies for current worker is resolved.
-      Await.result(future, 1 minute).asInstanceOf[AwaitingConnections] match {
-        // numNotReachable is the number of workers that currently
-        // cannot be connected to (pending connection or setup). Instead, this worker will AWAIT
-        // connections from those currently non-reachable nodes in the future.
-        case AwaitingConnections(waitConnNodes, numNotReachable) =>
-          log.debug(s"Rank $rank needs to connect to: $waitConnNodes, # bad: $numNotReachable")
-          val buf = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder())
-          buf.putInt(waitConnNodes.size).putInt(numNotReachable)
-          buf.flip()
-
-          // cache this message until the final state (SetupComplete)
-          pendingAcknowledgement = Some(AcknowledgeAcceptance(
-            waitConnNodes, numNotReachable))
-
-          connection ! Tcp.Write(ByteString.fromByteBuffer(buf))
-          if (waitConnNodes.isEmpty) {
-            connection ! Tcp.SuspendReading
-            goto(AwaitingErrorCount)
-          }
-          else {
-            waitConnNodes.foreach { case (peerRank, peerRef) =>
-              peerRef ! RequestWorkerHostPort
-            }
-
-            // a countdown for DivulgedHostPort messages.
-            stay using DataStruct(Seq.empty[DataField], waitConnNodes.size - 1)
-          }
-      }
-
-    case Event(DivulgedWorkerHostPort(peerRank, peerHost, peerPort), data) =>
-      val hostBytes = peerHost.getBytes()
-      val buffer = ByteBuffer.allocate(4 * 3 + hostBytes.length)
-        .order(ByteOrder.nativeOrder())
-      buffer.putInt(peerHost.length).put(hostBytes)
-        .putInt(peerPort).putInt(peerRank)
-
-      buffer.flip()
-      connection ! Tcp.Write(ByteString.fromByteBuffer(buffer))
-
-      if (data.counter == 0) {
-        // to prevent reading before state transition
-        connection ! Tcp.SuspendReading
-        goto(AwaitingErrorCount)
-      }
-      else {
-        stay using data.decrement()
-      }
-  }
-
-  when(AwaitingErrorCount) {
-    case Event(Tcp.Received(numErrors), _) =>
-      val buf = numErrors.asNativeOrderByteBuffer
-
-      buf.getInt match {
-        case 0 =>
-          stashSpillOver(buf)
-          goto(AwaitingPortNumber)
-        case _ =>
-          stashSpillOver(buf)
-          goto(BuildingLinkMap) using StructNodes
-      }
-  }
-
-  when(AwaitingPortNumber) {
-    case Event(Tcp.Received(assignedPort), _) =>
-      assert(assignedPort.length == 4)
-      port = assignedPort.asNativeOrderByteBuffer.getInt
-      log.debug(s"Rank $rank listening @ $host:$port")
-      // wait until the worker closes connection.
-      if (peerClosed) goto(SetupComplete) else stay
-
-    case Event(Tcp.PeerClosed, _) =>
-      peerClosed = true
-      if (port == 0) stay else goto(SetupComplete)
-  }
-
-  when(SetupComplete) {
-    case Event(ReduceWaitCount(count: Int), _) =>
-      awaitingAcceptance -= count
-      // check peerClosed to avoid prematurely stopping this actor (which sends RST to worker)
-      if (awaitingAcceptance == 0 && peerClosed) {
-        tracker ! DropFromWaitingList(rank)
-        // no longer needed.
-        context.stop(self)
-      }
-      stay
-
-    case Event(AcknowledgeAcceptance(peers, numBad), _) =>
-      awaitingAcceptance = numBad
-      tracker ! WorkerStarted(host, rank, awaitingAcceptance)
-      peers.values.foreach { peer =>
-        peer ! ReduceWaitCount(1)
-      }
-
-      if (awaitingAcceptance == 0 && peerClosed) self ! PoisonPill
-
-      stay
-
-    // can only divulge the complete host and port information
-    // when this worker is declared fully connected (otherwise
-    // port information is still missing.)
-    case Event(RequestWorkerHostPort, _) =>
-      sender() ! DivulgedWorkerHostPort(rank, host, port)
-      stay
-  }
-
-  onTransition {
-    // reset buffer when state transitions as data becomes stale
-    case _ -> SetupComplete =>
-      connection ! Tcp.ResumeReading
-      resetBuffers()
-      if (pendingAcknowledgement.isDefined) {
-        self ! pendingAcknowledgement.get
-      }
-    case _ =>
-      connection ! Tcp.ResumeReading
-      resetBuffers()
-  }
-
-  // default message handler
-  whenUnhandled {
-    case Event(Tcp.PeerClosed, _) =>
-      peerClosed = true
-      if (transient) context.stop(self)
-      stay
-  }
-}
-
-private[scala] object RabitWorkerHandler {
-  val MAGIC_NUMBER = 0xff99
-
-  // Finite states of this actor, which acts like a FSM.
-  // The following states are defined in order as the FSM progresses.
-  sealed trait State
-
-  // [1] Initial state, awaiting worker to send magic number per protocol.
-  case object AwaitingHandshake extends State
-  // [2] Awaiting worker to send command (start/print/recover/shutdown etc.)
-  case object AwaitingCommand extends State
-  // [3] Brokers connections between workers per ring/tree/parent link map.
-  case object BuildingLinkMap extends State
-  // [4] A transient state in which the worker reports the number of errors in establishing
-  // connections to other peer workers. If no errors, transition to next state.
-  case object AwaitingErrorCount extends State
-  // [5] Awaiting the worker to report its port number for accepting connections from peer workers.
-  // This port number information is later forwarded to linked workers.
-  case object AwaitingPortNumber extends State
-  // [6] Final state after completing the setup with the connecting worker. At this stage, the
-  // worker will have closed the Tcp connection. The actor remains alive to handle messages from
-  // peer actors representing workers with pending setups.
-  case object SetupComplete extends State
-
-  sealed trait DataField
-  case object IntField extends DataField
-  // an integer preceding the actual string
-  case object StringField extends DataField
-  case object IntSeqField extends DataField
-
-  object DataStruct {
-    def apply(): DataStruct = DataStruct(Seq.empty[DataField], 0)
-  }
-
-  // Internal data pertaining to individual state, used to verify the validity of packets sent by
-  // workers.
-  case class DataStruct(fields: Seq[DataField], counter: Int) {
-    /**
-      * Validate whether the provided buffer is complete (i.e., contains
-      * all data fields specified for this DataStruct.)
- *
-      * @param buf a byte buffer containing received data.
-      */
-    def verify(buf: ByteBuffer): Boolean = {
-      if (fields.isEmpty) return true
-
-      val dupBuf = buf.duplicate().order(ByteOrder.nativeOrder())
-      dupBuf.flip()
-
-      Try(fields.foldLeft(true) {
-        case (complete, field) =>
-          val remBytes = dupBuf.remaining()
-          complete && (remBytes > 0) && (remBytes >= (field match {
-            case IntField =>
-              dupBuf.position(dupBuf.position() + 4)
-              4
-            case StringField =>
-              val strLen = dupBuf.getInt
-              dupBuf.position(dupBuf.position() + strLen)
-              4 + strLen
-            case IntSeqField =>
-              val seqLen = dupBuf.getInt
-              dupBuf.position(dupBuf.position() + seqLen * 4)
-              4 + seqLen * 4
-          }))
-      }).getOrElse(false)
-    }
-
-    def increment(): DataStruct = DataStruct(fields, counter + 1)
-    def decrement(): DataStruct = DataStruct(fields, counter - 1)
-  }
-
-  val StructNodes = DataStruct(List(IntSeqField), 0)
-  val StructTrackerCommand = DataStruct(List(
-    IntField, IntField, StringField, StringField
-  ), 0)
-
-  // ---- Messages between RabitTrackerHandler and RabitTrackerConnectionHandler ----
-
-  // RabitWorkerHandler --> RabitTrackerHandler
-  sealed trait RabitWorkerRequest
-  // RabitWorkerHandler <-- RabitTrackerHandler
-  sealed trait RabitWorkerResponse
-
-  // Representations of decoded worker commands.
-  abstract class TrackerCommand(val command: String) extends RabitWorkerRequest {
-    def rank: Int
-    def worldSize: Int
-    def jobId: String
-
-    def encode: ByteString = {
-      val buf = ByteBuffer.allocate(4 * 4 + jobId.length + command.length)
-        .order(ByteOrder.nativeOrder())
-
-      buf.putInt(rank).putInt(worldSize).putInt(jobId.length).put(jobId.getBytes())
-        .putInt(command.length).put(command.getBytes()).flip()
-
-      ByteString.fromByteBuffer(buf)
-    }
-  }
-
-  case class WorkerStart(rank: Int, worldSize: Int, jobId: String)
-    extends TrackerCommand("start")
-  case class WorkerShutdown(rank: Int, worldSize: Int, jobId: String)
-    extends TrackerCommand("shutdown")
-  case class WorkerRecover(rank: Int, worldSize: Int, jobId: String)
-    extends TrackerCommand("recover")
-  case class WorkerTrackerPrint(rank: Int, worldSize: Int, jobId: String, msg: String)
-    extends TrackerCommand("print") {
-
-    override def encode: ByteString = {
-      val buf = ByteBuffer.allocate(4 * 5 + jobId.length + command.length + msg.length)
-        .order(ByteOrder.nativeOrder())
-
-      buf.putInt(rank).putInt(worldSize).putInt(jobId.length).put(jobId.getBytes())
-        .putInt(command.length).put(command.getBytes())
-        .putInt(msg.length).put(msg.getBytes()).flip()
-
-      ByteString.fromByteBuffer(buf)
-    }
-  }
-
-  // Request to remove the worker of given rank from the list of workers awaiting peer connections.
-  case class DropFromWaitingList(rank: Int) extends RabitWorkerRequest
-  // Notify the tracker that the worker of given rank has finished setup and started.
-  case class WorkerStarted(host: String, rank: Int, awaitingAcceptance: Int)
-    extends RabitWorkerRequest
-  // Request the set of workers to connect to, according to the LinkMap structure.
-  case class RequestAwaitConnWorkers(rank: Int, toConnectSet: Set[Int])
-    extends RabitWorkerRequest
-
-  // Request, from the tracker, the set of nodes to connect.
-  case class AwaitingConnections(workers: Map[Int, ActorRef], numBad: Int)
-    extends RabitWorkerResponse
-
-  // ---- Messages between ConnectionHandler actors ----
-  sealed trait IntraWorkerMessage
-
-  // Notify neighboring workers to decrease the counter of awaiting workers by `count`.
-  case class ReduceWaitCount(count: Int) extends IntraWorkerMessage
-  // Request host and port information from peer ConnectionHandler actors (acting on behave of
-  // connecting workers.) This message will be brokered by RabitTrackerHandler.
-  case object RequestWorkerHostPort extends IntraWorkerMessage
-  // Response to the above request
-  case class DivulgedWorkerHostPort(rank: Int, host: String, port: Int) extends IntraWorkerMessage
-  // A reminder to send ReduceWaitCount messages once the actor is in state "SetupComplete".
-  case class AcknowledgeAcceptance(peers: Map[Int, ActorRef], numBad: Int)
-    extends IntraWorkerMessage
-
-  // ---- End of message definitions ----
-
-  def props(host: String, worldSize: Int, tracker: ActorRef, connection: ActorRef): Props = {
-    Props(new RabitWorkerHandler(host, worldSize, tracker, connection))
-  }
-}
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala
deleted file mode 100644
index edec4931b..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/LinkMap.scala
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.util
-
-import java.nio.{ByteBuffer, ByteOrder}
-
-/**
-  * The assigned rank to a connecting Rabit worker, along with the information of the ranks of
-  * its linked peer workers, which are critical to perform Allreduce.
-  * When RabitWorkerHandler delegates "start" or "recover" commands from the connecting worker
-  * client, RabitTrackerHandler utilizes LinkMap to figure out linkage relationships, and respond
-  * with this class as a message, which is later encoded to byte string, and sent over socket
-  * connection to the worker client.
-  *
-  * @param rank assigned rank (ranked by worker connection order: first worker connecting to the
-  *             tracker is assigned rank 0, second with rank 1, etc.)
-  * @param neighbors ranks of neighboring workers in a tree map.
-  * @param ring ranks of neighboring workers in a ring map.
-  * @param parent rank of the parent worker.
-  */
-private[rabit] case class AssignedRank(rank: Int, neighbors: Seq[Int],
-                                       ring: (Int, Int), parent: Int) {
-  /**
-    * Encode the AssignedRank message into byte sequence for socket communication with Rabit worker
-    * client.
-    * @param worldSize the number of total distributed workers. Must match `numWorkers` used in
-    *                  LinkMap.
-    * @return a ByteBuffer containing encoded data.
-    */
-  def toByteBuffer(worldSize: Int): ByteBuffer = {
-    val buffer = ByteBuffer.allocate(4 * (neighbors.length + 6)).order(ByteOrder.nativeOrder())
-    buffer.putInt(rank).putInt(parent).putInt(worldSize).putInt(neighbors.length)
-    // neighbors in tree structure
-    neighbors.foreach { n => buffer.putInt(n) }
-    buffer.putInt(if (ring._1 != -1 && ring._1 != rank) ring._1 else -1)
-    buffer.putInt(if (ring._2 != -1 && ring._2 != rank) ring._2 else -1)
-
-    buffer.flip()
-    buffer
-  }
-}
-
-private[rabit] class LinkMap(numWorkers: Int) {
-  private def getNeighbors(rank: Int): Seq[Int] = {
-    val rank1 = rank + 1
-    Vector(rank1 / 2 - 1, rank1 * 2 - 1, rank1 * 2).filter { r =>
-      r >= 0 && r < numWorkers
-    }
-  }
-
-  /**
-    * Construct a ring structure that tends to share nodes with the tree.
-    *
-    * @param treeMap
-    * @param parentMap
-    * @param rank
-    * @return Seq[Int] instance starting from rank.
-    */
-  private def constructShareRing(treeMap: Map[Int, Seq[Int]],
-                                 parentMap: Map[Int, Int],
-                                 rank: Int = 0): Seq[Int] = {
-    treeMap(rank).toSet - parentMap(rank) match {
-      case emptySet if emptySet.isEmpty =>
-        List(rank)
-      case connectionSet =>
-        connectionSet.zipWithIndex.foldLeft(List(rank)) {
-          case (ringSeq, (v, cnt)) =>
-            val vConnSeq = constructShareRing(treeMap, parentMap, v)
-            vConnSeq match {
-              case vconn if vconn.size == cnt + 1 =>
-                ringSeq ++ vconn.reverse
-              case vconn =>
-                ringSeq ++ vconn
-            }
-        }
-    }
-  }
-  /**
-    * Construct a ring connection used to recover local data.
-    *
-    * @param treeMap
-    * @param parentMap
-    */
-  private def constructRingMap(treeMap: Map[Int, Seq[Int]], parentMap: Map[Int, Int]) = {
-    assert(parentMap(0) == -1)
-
-    val sharedRing = constructShareRing(treeMap, parentMap, 0).toVector
-    assert(sharedRing.length == treeMap.size)
-
-    (0 until numWorkers).map { r =>
-      val rPrev = (r + numWorkers - 1) % numWorkers
-      val rNext = (r + 1) % numWorkers
-      sharedRing(r) -> (sharedRing(rPrev), sharedRing(rNext))
-    }.toMap
-  }
-
-  private[this] val treeMap_ = (0 until numWorkers).map { r => r -> getNeighbors(r) }.toMap
-  private[this] val parentMap_ = (0 until numWorkers).map{ r => r -> ((r + 1) / 2 - 1) }.toMap
-  private[this] val ringMap_ = constructRingMap(treeMap_, parentMap_)
-  val rMap_ = (0 until (numWorkers - 1)).foldLeft((Map(0 -> 0), 0)) {
-    case ((rmap, k), i) =>
-      val kNext = ringMap_(k)._2
-      (rmap ++ Map(kNext -> (i + 1)), kNext)
-  }._1
-
-  val ringMap = ringMap_.map {
-    case (k, (v0, v1)) => rMap_(k) -> (rMap_(v0), rMap_(v1))
-  }
-  val treeMap = treeMap_.map {
-    case (k, vSeq) => rMap_(k) -> vSeq.map{ v => rMap_(v) }
-  }
-  val parentMap = parentMap_.map {
-    case (k, v) if k == 0 =>
-      rMap_(k) -> -1
-    case (k, v) =>
-      rMap_(k) -> rMap_(v)
-  }
-
-  def assignRank(rank: Int): AssignedRank = {
-    AssignedRank(rank, treeMap(rank), ringMap(rank), parentMap(rank))
-  }
-}
diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala
deleted file mode 100644
index 3d7be618d..000000000
--- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/rabit/util/RabitTrackerHelpers.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit.util
-
-import java.nio.{ByteOrder, ByteBuffer}
-import akka.util.ByteString
-
-private[rabit] object RabitTrackerHelpers {
-  implicit class ByteStringHelplers(bs: ByteString) {
-    // Java by default uses big endian. Enforce native endian so that
-    // the byte order is consistent with the workers.
-    def asNativeOrderByteBuffer: ByteBuffer = {
-      bs.asByteBuffer.order(ByteOrder.nativeOrder())
-    }
-  }
-
-  implicit class ByteBufferHelpers(buf: ByteBuffer) {
-    def getString: String = {
-      val len = buf.getInt()
-      val stringBuffer = ByteBuffer.allocate(len).order(ByteOrder.nativeOrder())
-      buf.get(stringBuffer.array(), 0, len)
-      new String(stringBuffer.array(), "utf-8")
-    }
-  }
-}
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala
deleted file mode 100644
index cd9016812..000000000
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/rabit/RabitTrackerConnectionHandlerTest.scala
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.rabit
-
-import java.nio.{ByteBuffer, ByteOrder}
-
-import akka.actor.{ActorRef, ActorSystem}
-import akka.io.Tcp
-import akka.testkit.{ImplicitSender, TestFSMRef, TestKit, TestProbe}
-import akka.util.ByteString
-import ml.dmlc.xgboost4j.scala.rabit.handler.RabitWorkerHandler
-import ml.dmlc.xgboost4j.scala.rabit.handler.RabitWorkerHandler._
-import ml.dmlc.xgboost4j.scala.rabit.util.LinkMap
-import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{FlatSpecLike, Matchers}
-
-import scala.concurrent.Promise
-
-object RabitTrackerConnectionHandlerTest {
-  def intSeqToByteString(seq: Seq[Int]): ByteString = {
-    val buf = ByteBuffer.allocate(seq.length * 4).order(ByteOrder.nativeOrder())
-    seq.foreach { i => buf.putInt(i) }
-    buf.flip()
-    ByteString.fromByteBuffer(buf)
-  }
-}
-
-@RunWith(classOf[JUnitRunner])
-class RabitTrackerConnectionHandlerTest
-  extends TestKit(ActorSystem("RabitTrackerConnectionHandlerTest"))
-    with FlatSpecLike with Matchers with ImplicitSender {
-
-  import RabitTrackerConnectionHandlerTest._
-
-  val magic = intSeqToByteString(List(0xff99))
-
-  "RabitTrackerConnectionHandler" should "handle Rabit client 'start' command properly" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val worldSize = 4
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", worldSize,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    // send mock magic number
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // send mock tracker command in fragments: the handler should be able to handle it.
-    val bufRank = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder())
-    bufRank.putInt(0).putInt(worldSize).flip()
-
-    val bufJobId = ByteBuffer.allocate(5).order(ByteOrder.nativeOrder())
-    bufJobId.putInt(1).put(Array[Byte]('0')).flip()
-
-    val bufCmd = ByteBuffer.allocate(9).order(ByteOrder.nativeOrder())
-    bufCmd.putInt(5).put("start".getBytes()).flip()
-
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufRank))
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufJobId))
-
-    // the state should not change for incomplete command data.
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-
-    // send the last fragment, and expect message at tracker actor.
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufCmd))
-    trackerProbe.expectMsg(WorkerStart(0, worldSize, "0"))
-
-    val linkMap = new LinkMap(worldSize)
-    val assignedRank = linkMap.assignRank(0)
-    trackerProbe.reply(assignedRank)
-
-    connProbe.expectMsg(Tcp.Write(ByteString.fromByteBuffer(
-      assignedRank.toByteBuffer(worldSize)
-    )))
-
-    // reading should be suspended upon transitioning to BuildingLinkMap
-    connProbe.expectMsg(Tcp.SuspendReading)
-    // state should transition with according state data changes.
-    fsm.stateName shouldEqual RabitWorkerHandler.BuildingLinkMap
-    fsm.stateData shouldEqual RabitWorkerHandler.StructNodes
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // since the connection handler in test has rank 0, it will not have any nodes to connect to.
-    fsm ! Tcp.Received(intSeqToByteString(List(0)))
-    trackerProbe.expectMsg(RequestAwaitConnWorkers(0, fsm.underlyingActor.getNeighboringWorkers))
-
-    // return mock response to the connection handler
-    val awaitConnPromise = Promise[AwaitingConnections]()
-    awaitConnPromise.success(AwaitingConnections(Map.empty[Int, ActorRef],
-      fsm.underlyingActor.getNeighboringWorkers.size
-    ))
-    fsm ! awaitConnPromise.future
-    connProbe.expectMsg(Tcp.Write(
-      intSeqToByteString(List(0, fsm.underlyingActor.getNeighboringWorkers.size))
-    ))
-    connProbe.expectMsg(Tcp.SuspendReading)
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingErrorCount
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // send mock error count (0)
-    fsm ! Tcp.Received(intSeqToByteString(List(0)))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingPortNumber
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // simulate Tcp.PeerClosed event first, then Tcp.Received to test handling of async events.
-    fsm ! Tcp.PeerClosed
-    // state should not transition
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingPortNumber
-    fsm ! Tcp.Received(intSeqToByteString(List(32768)))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.SetupComplete
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    trackerProbe.expectMsg(RabitWorkerHandler.WorkerStarted("localhost", 0, 2))
-
-    val handlerStopProbe = TestProbe()
-    handlerStopProbe watch fsm
-
-    // simulate connections from other workers by mocking ReduceWaitCount commands
-    fsm ! RabitWorkerHandler.ReduceWaitCount(1)
-    fsm.stateName shouldEqual RabitWorkerHandler.SetupComplete
-    fsm ! RabitWorkerHandler.ReduceWaitCount(1)
-    trackerProbe.expectMsg(RabitWorkerHandler.DropFromWaitingList(0))
-    handlerStopProbe.expectTerminated(fsm)
-
-    // all done.
-  }
-
-  it should "forward print command to tracker" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", 4,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    val printCmd = WorkerTrackerPrint(0, 4, "print", "hello world!")
-    fsm ! Tcp.Received(printCmd.encode)
-
-    trackerProbe.expectMsg(printCmd)
-  }
-
-  it should "handle fragmented print command without throwing exception" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", 4,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    val printCmd = WorkerTrackerPrint(0, 4, "0", "fragmented!")
-    // 4 (rank: Int) + 4 (worldSize: Int) + (4+1) (jobId: String) + (4+5) (command: String) = 22
-    val (partialMessage, remainder) = printCmd.encode.splitAt(22)
-
-    // make sure that the partialMessage in itself is a valid command
-    val partialMsgBuf = ByteBuffer.allocate(22).order(ByteOrder.nativeOrder())
-    partialMsgBuf.put(partialMessage.asByteBuffer)
-    RabitWorkerHandler.StructTrackerCommand.verify(partialMsgBuf) shouldBe true
-
-    fsm ! Tcp.Received(partialMessage)
-    fsm ! Tcp.Received(remainder)
-
-    trackerProbe.expectMsg(printCmd)
-  }
-
-  it should "handle spill-over Tcp data correctly between state transition" in {
-    val trackerProbe = TestProbe()
-    val connProbe = TestProbe()
-
-    val worldSize = 4
-
-    val fsm = TestFSMRef(new RabitWorkerHandler("localhost", worldSize,
-      trackerProbe.ref, connProbe.ref))
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingHandshake
-
-    // send mock magic number
-    fsm ! Tcp.Received(magic)
-    connProbe.expectMsg(Tcp.Write(magic))
-
-    fsm.stateName shouldEqual RabitWorkerHandler.AwaitingCommand
-    fsm.stateData shouldEqual RabitWorkerHandler.StructTrackerCommand
-    // ResumeReading should be seen once state transitions
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // send mock tracker command in fragments: the handler should be able to handle it.
-    val bufCmd = ByteBuffer.allocate(26).order(ByteOrder.nativeOrder())
-    bufCmd.putInt(0).putInt(worldSize).putInt(1).put(Array[Byte]('0'))
-      .putInt(5).put("start".getBytes())
-      // spilled-over data
-      .putInt(0).flip()
-
-    // send data with 4 extra bytes corresponding to the next state.
-    fsm ! Tcp.Received(ByteString.fromByteBuffer(bufCmd))
-
-    trackerProbe.expectMsg(WorkerStart(0, worldSize, "0"))
-
-    val linkMap = new LinkMap(worldSize)
-    val assignedRank = linkMap.assignRank(0)
-    trackerProbe.reply(assignedRank)
-
-    connProbe.expectMsg(Tcp.Write(ByteString.fromByteBuffer(
-      assignedRank.toByteBuffer(worldSize)
-    )))
-
-    // reading should be suspended upon transitioning to BuildingLinkMap
-    connProbe.expectMsg(Tcp.SuspendReading)
-    // state should transition with according state data changes.
-    fsm.stateName shouldEqual RabitWorkerHandler.BuildingLinkMap
-    fsm.stateData shouldEqual RabitWorkerHandler.StructNodes
-    connProbe.expectMsg(Tcp.ResumeReading)
-
-    // the handler should be able to handle spill-over data, and stash it until state transition.
-    trackerProbe.expectMsg(RequestAwaitConnWorkers(0, fsm.underlyingActor.getNeighboringWorkers))
-  }
-}

From a84a1fde0209b17cb9ffca4d2ee3e2180e7b970e Mon Sep 17 00:00:00 2001
From: Emil Ejbyfeldt <eejbyfeldt@liveintent.com>
Date: Thu, 20 Apr 2023 16:16:56 +0200
Subject: [PATCH 05/34] [jvm-packages] Update scalatest to 3.2.15 (#8925)

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 jvm-packages/pom.xml                                       | 4 ++--
 jvm-packages/xgboost4j-gpu/pom.xml                         | 2 +-
 .../ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala     | 4 ++--
 .../dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala   | 7 ++++---
 .../scala/spark/CommunicatorRobustnessSuite.scala          | 4 ++--
 .../scala/spark/DeterministicPartitioningSuite.scala       | 4 ++--
 .../scala/spark/ExternalCheckpointManagerSuite.scala       | 4 ++--
 .../xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala | 4 ++--
 .../xgboost4j/scala/spark/MissingValueHandlingSuite.scala  | 4 ++--
 .../ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala     | 5 +++--
 .../test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala | 5 +++--
 .../ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala   | 4 ++--
 .../ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala  | 5 +++--
 .../xgboost4j/scala/spark/XGBoostClassifierSuite.scala     | 4 ++--
 .../scala/spark/XGBoostCommunicatorRegressionSuite.scala   | 4 ++--
 .../dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala | 4 ++--
 .../dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala   | 4 ++--
 .../dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala | 4 ++--
 jvm-packages/xgboost4j-tester/generate_pom.py              | 2 +-
 jvm-packages/xgboost4j/pom.xml                             | 2 +-
 .../test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala  | 4 ++--
 .../ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala    | 4 ++--
 22 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index facb955ce..9d7e41651 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -495,13 +495,13 @@
         <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
-            <version>3.0.8</version>
+            <version>3.2.15</version>
             <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>org.scalactic</groupId>
             <artifactId>scalactic_${scala.binary.version}</artifactId>
-            <version>3.0.8</version>
+            <version>3.2.15</version>
             <scope>test</scope>
         </dependency>
     </dependencies>
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index 167635209..5ffe0588c 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -53,7 +53,7 @@
         <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
-            <version>3.0.5</version>
+            <version>3.2.15</version>
             <scope>provided</scope>
         </dependency>
         <dependency>
diff --git a/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala b/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala
index ba8c5fa9a..28ac2207a 100644
--- a/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala
+++ b/jvm-packages/xgboost4j-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/QuantileDMatrixSuite.scala
@@ -19,10 +19,10 @@ package ml.dmlc.xgboost4j.scala
 import scala.collection.mutable.ArrayBuffer
 
 import ai.rapids.cudf.Table
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import ml.dmlc.xgboost4j.gpu.java.CudfColumnBatch
 
-class QuantileDMatrixSuite extends FunSuite {
+class QuantileDMatrixSuite extends AnyFunSuite {
 
   test("QuantileDMatrix test") {
 
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
index 175e00b39..2a355e160 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
@@ -20,14 +20,15 @@ import java.nio.file.{Files, Path}
 import java.sql.{Date, Timestamp}
 import java.util.{Locale, TimeZone}
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.{GpuTestUtils, SparkConf}
 import org.apache.spark.internal.Logging
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.sql.{Row, SparkSession}
 
-trait GpuTestSuite extends FunSuite with TmpFolderSuite {
+trait GpuTestSuite extends AnyFunSuite with TmpFolderSuite {
   import SparkSessionHolder.withSparkSession
 
   protected def getResourcePath(resource: String): String = {
@@ -200,7 +201,7 @@ trait GpuTestSuite extends FunSuite with TmpFolderSuite {
 
 }
 
-trait TmpFolderSuite extends BeforeAndAfterAll { self: FunSuite =>
+trait TmpFolderSuite extends BeforeAndAfterAll { self: AnyFunSuite =>
   protected var tempDir: Path = _
 
   override def beforeAll(): Unit = {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
index 04081c3fe..5445cd1bf 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/CommunicatorRobustnessSuite.scala
@@ -23,9 +23,9 @@ import scala.util.Random
 import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.java.IRabitTracker.TrackerStatus
 import ml.dmlc.xgboost4j.scala.DMatrix
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
-class CommunicatorRobustnessSuite extends FunSuite with PerTest {
+class CommunicatorRobustnessSuite extends AnyFunSuite with PerTest {
 
   private def getXGBoostExecutionParams(paramMap: Map[String, Any]): XGBoostExecutionParams = {
     val classifier = new XGBoostClassifier(paramMap)
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
index 61766b755..8d9723bb6 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
@@ -17,13 +17,13 @@
 package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.spark.ml.linalg.Vectors
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import ml.dmlc.xgboost4j.scala.spark.util.DataUtils
 import ml.dmlc.xgboost4j.scala.spark.util.DataUtils.PackedParams
 
 import org.apache.spark.sql.functions._
 
-class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class DeterministicPartitioningSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   test("perform deterministic partitioning when checkpointInternal and" +
     " checkpointPath is set (Classifier)") {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
index cdcfd76f5..adc9c1068 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala
@@ -19,10 +19,10 @@ package ml.dmlc.xgboost4j.scala.spark
 import java.io.File
 
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, ExternalCheckpointManager, XGBoost => SXGBoost}
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.hadoop.fs.{FileSystem, Path}
 
-class ExternalCheckpointManagerSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   private def produceParamMap(checkpointPath: String, checkpointInterval: Int):
   Map[String, Any] = {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
index e0151dde3..789fd162b 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala
@@ -18,12 +18,12 @@ package ml.dmlc.xgboost4j.scala.spark
 
 import org.apache.spark.Partitioner
 import org.apache.spark.ml.feature.VectorAssembler
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.spark.sql.functions._
 
 import scala.util.Random
 
-class FeatureSizeValidatingSuite extends FunSuite with PerTest {
+class FeatureSizeValidatingSuite extends AnyFunSuite with PerTest {
 
   test("transform throwing exception if feature size of dataset is greater than model's") {
     val modelPath = getClass.getResource("/model/0.82/model").getPath
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
index 5863e2ace..6a7f7129d 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala
@@ -19,12 +19,12 @@ package ml.dmlc.xgboost4j.scala.spark
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.sql.DataFrame
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import scala.util.Random
 
 import org.apache.spark.SparkException
 
-class MissingValueHandlingSuite extends FunSuite with PerTest {
+class MissingValueHandlingSuite extends AnyFunSuite with PerTest {
   test("dense vectors containing missing value") {
     def buildDenseDataFrame(): DataFrame = {
       val numRows = 100
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala
index e3468b811..11b60e74d 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala
@@ -16,12 +16,13 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.SparkException
 import org.apache.spark.ml.param.ParamMap
 
-class ParameterSuite extends FunSuite with PerTest with BeforeAndAfterAll {
+class ParameterSuite extends AnyFunSuite with PerTest with BeforeAndAfterAll {
 
   test("XGBoost and Spark parameters synchronize correctly") {
     val xgbParamMap = Map("eta" -> "1", "objective" -> "binary:logistic",
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala
index e96618c51..24bc00e18 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala
@@ -22,13 +22,14 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 
 import org.apache.spark.SparkContext
 import org.apache.spark.sql._
-import org.scalatest.{BeforeAndAfterEach, FunSuite}
+import org.scalatest.BeforeAndAfterEach
+import org.scalatest.funsuite.AnyFunSuite
 import scala.math.min
 import scala.util.Random
 
 import org.apache.commons.io.IOUtils
 
-trait PerTest extends BeforeAndAfterEach { self: FunSuite =>
+trait PerTest extends BeforeAndAfterEach { self: AnyFunSuite =>
 
   protected val numWorkers: Int = min(Runtime.getRuntime.availableProcessors(), 4)
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
index cf8dcca57..5425b8647 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala
@@ -25,9 +25,9 @@ import scala.util.Random
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.{Pipeline, PipelineModel}
 import org.apache.spark.sql.functions._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
-class PersistenceSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class PersistenceSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   test("test persistence of XGBoostClassifier and XGBoostClassificationModel") {
     val eval = new EvalError()
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala
index 96b74d679..bb523ffdf 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/TmpFolderPerSuite.scala
@@ -19,9 +19,10 @@ package ml.dmlc.xgboost4j.scala.spark
 import java.nio.file.{Files, Path}
 
 import org.apache.spark.network.util.JavaUtils
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
-trait TmpFolderPerSuite extends BeforeAndAfterAll { self: FunSuite =>
+trait TmpFolderPerSuite extends BeforeAndAfterAll { self: AnyFunSuite =>
   protected var tempDir: Path = _
 
   override def beforeAll(): Unit = {
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
index f31207b9f..0031be9c7 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala
@@ -22,13 +22,13 @@ import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
 
 import org.apache.spark.ml.linalg._
 import org.apache.spark.sql._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.commons.io.IOUtils
 
 import org.apache.spark.Partitioner
 import org.apache.spark.ml.feature.VectorAssembler
 
-class XGBoostClassifierSuite extends FunSuite with PerTest with TmpFolderPerSuite {
+class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite {
 
   protected val treeMethod: String = "auto"
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala
index a7310f1ab..86b82e63c 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostCommunicatorRegressionSuite.scala
@@ -21,11 +21,11 @@ import ml.dmlc.xgboost4j.scala.Booster
 import scala.collection.JavaConverters._
 
 import org.apache.spark.sql._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.SparkException
 
-class XGBoostCommunicatorRegressionSuite extends FunSuite with PerTest {
+class XGBoostCommunicatorRegressionSuite extends AnyFunSuite with PerTest {
   val predictionErrorMin = 0.00001f
   val maxFailure = 2;
 
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala
index 7d588d97c..086fda2d7 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala
@@ -19,9 +19,9 @@ package ml.dmlc.xgboost4j.scala.spark
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
 
 import org.apache.spark.sql._
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
-class XGBoostConfigureSuite extends FunSuite with PerTest {
+class XGBoostConfigureSuite extends AnyFunSuite with PerTest {
 
   override def sparkSessionBuilder: SparkSession.Builder = super.sparkSessionBuilder
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
index 0bf8c2fbb..c1e34224c 100755
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
@@ -22,12 +22,12 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import ml.dmlc.xgboost4j.scala.DMatrix
 
 import org.apache.spark.{SparkException, TaskContext}
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.sql.functions.lit
 
-class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest {
+class XGBoostGeneralSuite extends AnyFunSuite with TmpFolderPerSuite with PerTest {
 
   test("distributed training with the specified worker number") {
     val trainingRDD = sc.parallelize(Classification.train)
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
index 4e3d59b25..efcb38cf6 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala
@@ -23,11 +23,11 @@ import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost}
 import org.apache.spark.ml.linalg.{Vector, Vectors}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Row}
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import org.apache.spark.ml.feature.VectorAssembler
 
-class XGBoostRegressorSuite extends FunSuite with PerTest with TmpFolderPerSuite {
+class XGBoostRegressorSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite {
   protected val treeMethod: String = "auto"
 
   test("XGBoost-Spark XGBoostRegressor output should match XGBoost4j") {
diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py
index edc9759bd..06372e9b2 100644
--- a/jvm-packages/xgboost4j-tester/generate_pom.py
+++ b/jvm-packages/xgboost4j-tester/generate_pom.py
@@ -69,7 +69,7 @@ pom_template = """
     <dependency>
       <groupId>org.scalactic</groupId>
       <artifactId>scalactic_${{scala.binary.version}}</artifactId>
-      <version>3.0.8</version>
+      <version>3.2.15</version>
       <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index aa8694751..7c5c33e87 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -46,7 +46,7 @@
         <dependency>
           <groupId>org.scalatest</groupId>
           <artifactId>scalatest_${scala.binary.version}</artifactId>
-          <version>3.0.5</version>
+          <version>3.2.15</version>
           <scope>provided</scope>
         </dependency>
     </dependencies>
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
index 05200f49e..05c6856f7 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
@@ -20,10 +20,10 @@ import java.util.Arrays
 
 import scala.util.Random
 
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 
-class DMatrixSuite extends FunSuite {
+class DMatrixSuite extends AnyFunSuite {
   test("create DMatrix from File") {
     val dmat = new DMatrix("../../demo/data/agaricus.txt.test")
     // get label
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
index 157971f82..8cac9fe4f 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
@@ -20,11 +20,11 @@ import java.io.{FileOutputStream, FileInputStream, File}
 
 import junit.framework.TestCase
 import org.apache.commons.logging.LogFactory
-import org.scalatest.FunSuite
+import org.scalatest.funsuite.AnyFunSuite
 
 import ml.dmlc.xgboost4j.java.XGBoostError
 
-class ScalaBoosterImplSuite extends FunSuite {
+class ScalaBoosterImplSuite extends AnyFunSuite {
 
   private class EvalError extends EvalTrait {
 

From 2acd78b44b4d33a0089d45de365a04efd17ab7c9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Apr 2023 00:10:45 +0800
Subject: [PATCH 06/34] Bump maven-project-info-reports-plugin in
 /jvm-packages/xgboost4j (#9049)

Bumps [maven-project-info-reports-plugin](https://github.com/apache/maven-project-info-reports-plugin) from 3.4.2 to 3.4.3.
- [Release notes](https://github.com/apache/maven-project-info-reports-plugin/releases)
- [Commits](https://github.com/apache/maven-project-info-reports-plugin/compare/maven-project-info-reports-plugin-3.4.2...maven-project-info-reports-plugin-3.4.3)

---
updated-dependencies:
- dependency-name: org.apache.maven.plugins:maven-project-info-reports-plugin
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 9d7e41651..8a6032984 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -450,7 +450,7 @@
         <plugins>
             <plugin>
                 <artifactId>maven-project-info-reports-plugin</artifactId>
-                <version>3.4.2</version>
+                <version>3.4.3</version>
             </plugin>
             <plugin>
                 <groupId>net.alchim31.maven</groupId>

From a7b3dd3176042bdec549a7df3424a09c6b0f0eaa Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 21 Apr 2023 02:26:47 +0800
Subject: [PATCH 07/34] Fix compiler warnings. (#9055)

---
 src/collective/aggregator.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
index ee499b4d1..12222cf9d 100644
--- a/src/collective/aggregator.h
+++ b/src/collective/aggregator.h
@@ -32,23 +32,23 @@ namespace collective {
  * @param function The function used to calculate the results.
  * @param args Arguments to the function.
  */
-template <typename Function, typename... Args>
-void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function,
+template <typename Function, typename T, typename... Args>
+void ApplyWithLabels(MetaInfo const& info, T* buffer, size_t size, Function&& function,
                      Args&&... args) {
   if (info.IsVerticalFederated()) {
     // We assume labels are only available on worker 0, so the calculation is done there and result
     // broadcast to other workers.
-    std::vector<char> message(1024);
+    std::string message;
     if (collective::GetRank() == 0) {
       try {
         std::forward<Function>(function)(std::forward<Args>(args)...);
       } catch (dmlc::Error& e) {
-        strncpy(&message[0], e.what(), message.size());
-        message.back() = '\0';
+        message = e.what();
       }
     }
-    collective::Broadcast(&message[0], message.size(), 0);
-    if (strlen(&message[0]) == 0) {
+
+    collective::Broadcast(&message, 0);
+    if (message.empty()) {
       collective::Broadcast(buffer, size, 0);
     } else {
       LOG(FATAL) << &message[0];
@@ -57,6 +57,5 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
     std::forward<Function>(function)(std::forward<Args>(args)...);
   }
 }
-
 }  // namespace collective
 }  // namespace xgboost

From a5cd2412de3b9715c963c2b3d1b66c81bdb03640 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Thu, 20 Apr 2023 13:51:39 -0700
Subject: [PATCH 08/34] Replace setup.py with pyproject.toml (#9021)

* Create pyproject.toml
* Implement a custom build backend (see below) in packager directory. Build logic from setup.py has been refactored and migrated into the new backend.
* Tested: pip wheel . (build wheel), python -m build --sdist . (source distribution)
---
 .github/workflows/python_tests.yml       |  13 +-
 CMakeLists.txt                           |  10 +-
 dev/release-artifacts.py                 |   2 +-
 doc/build.rst                            | 172 +++++-----
 doc/contrib/ci.rst                       |   5 +-
 doc/contrib/index.rst                    |   1 +
 doc/contrib/python_packaging.rst         |  83 +++++
 doc/install.rst                          |  17 +-
 plugin/federated/README.md               |   2 +-
 python-package/MANIFEST.in               |  56 ----
 python-package/hatch_build.py            |  22 ++
 python-package/packager/__init__.py      |   0
 python-package/packager/build_config.py  |  56 ++++
 python-package/packager/nativelib.py     | 157 +++++++++
 python-package/packager/pep517.py        | 157 +++++++++
 python-package/packager/sdist.py         |  27 ++
 python-package/packager/util.py          |  25 ++
 python-package/pyproject.toml            |  42 +++
 python-package/setup.py                  | 399 -----------------------
 python-package/xgboost/config.py         |   2 +-
 python-package/xgboost/plotting.py       |   6 +-
 tests/buildkite/build-cpu-arm64.sh       |   2 +-
 tests/buildkite/build-cuda.sh            |   2 +-
 tests/buildkite/build-win64-gpu.ps1      |  12 +-
 tests/ci_build/build_python_wheels.sh    |   2 +-
 tests/ci_build/change_version.py         |  14 +-
 tests/ci_build/conda_env/python_lint.yml |   1 +
 tests/ci_build/conda_env/sdist_test.yml  |   1 +
 tests/ci_build/insert_vcomp140.py        | 102 ------
 tests/ci_build/lint_python.py            |   2 +-
 tests/ci_build/test_python.sh            |   2 +-
 31 files changed, 716 insertions(+), 678 deletions(-)
 create mode 100644 doc/contrib/python_packaging.rst
 delete mode 100644 python-package/MANIFEST.in
 create mode 100644 python-package/hatch_build.py
 create mode 100644 python-package/packager/__init__.py
 create mode 100644 python-package/packager/build_config.py
 create mode 100644 python-package/packager/nativelib.py
 create mode 100644 python-package/packager/pep517.py
 create mode 100644 python-package/packager/sdist.py
 create mode 100644 python-package/packager/util.py
 create mode 100644 python-package/pyproject.toml
 delete mode 100644 python-package/setup.py
 delete mode 100644 tests/ci_build/insert_vcomp140.py

diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 0d8e6d653..78a17d3f7 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -65,7 +65,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py sdist
+        python -m build --sdist
         pip install -v ./dist/xgboost-*.tar.gz
         cd ..
         python -c 'import xgboost'
@@ -92,6 +92,9 @@ jobs:
         auto-update-conda: true
         python-version: ${{ matrix.python-version }}
         activate-environment: test
+    - name: Install build
+      run: |
+        conda install -c conda-forge python-build
     - name: Display Conda env
       run: |
         conda info
@@ -100,7 +103,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py sdist
+        python -m build --sdist
         pip install -v ./dist/xgboost-*.tar.gz
         cd ..
         python -c 'import xgboost'
@@ -147,7 +150,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py install
+        pip install -v .
 
     - name: Test Python package
       run: |
@@ -194,7 +197,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py bdist_wheel --universal
+        pip wheel -v . --wheel-dir dist/
         pip install ./dist/*.whl
 
     - name: Test Python package
@@ -238,7 +241,7 @@ jobs:
       run: |
         cd python-package
         python --version
-        python setup.py install
+        pip install -v .
 
     - name: Test Python package
       run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5a61c60b..7953a10dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
 set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
 option(RABIT_MOCK "Build rabit with mock" OFF)
 option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF)
+option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF)
 ## CUDA
 option(USE_CUDA  "Build with GPU acceleration" OFF)
 option(USE_NCCL  "Build with NCCL to enable distributed GPU support." OFF)
@@ -268,8 +269,13 @@ if (JVM_BINDINGS)
   xgboost_target_defs(xgboost4j)
 endif (JVM_BINDINGS)
 
-set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
-set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
+if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR)
+  set_output_directory(runxgboost ${xgboost_BINARY_DIR})
+  set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib)
+else ()
+  set_output_directory(runxgboost ${xgboost_SOURCE_DIR})
+  set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib)
+endif ()
 # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names
 add_dependencies(xgboost runxgboost)
 
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 18c317a91..eab64ff0c 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -105,7 +105,7 @@ def make_pysrc_wheel(release: str, outdir: str) -> None:
         os.mkdir(dist)
 
     with DirectoryExcursion(os.path.join(ROOT, "python-package")):
-        subprocess.check_call(["python", "setup.py", "sdist"])
+        subprocess.check_call(["python", "-m", "build", "--sdist"])
         src = os.path.join(DIST, f"xgboost-{release}.tar.gz")
         subprocess.check_call(["twine", "check", src])
         shutil.move(src, os.path.join(dist, f"xgboost-{release}.tar.gz"))
diff --git a/doc/build.rst b/doc/build.rst
index 53d9a3209..e78d2d2f4 100644
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -12,6 +12,7 @@ systems.  If the instructions do not work for you, please feel free to ask quest
   Consider installing XGBoost from a pre-built binary, to avoid the trouble of building XGBoost from the source.  Checkout :doc:`Installation Guide </install>`.
 
 .. contents:: Contents
+  :local:
 
 .. _get_source:
 
@@ -152,11 +153,11 @@ On Windows, run CMake as follows:
 
   mkdir build
   cd build
-  cmake .. -G"Visual Studio 14 2015 Win64" -DUSE_CUDA=ON
+  cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON
 
 (Change the ``-G`` option appropriately if you have a different version of Visual Studio installed.)
 
-The above cmake configuration run will create an ``xgboost.sln`` solution file in the build directory. Build this solution in release mode as a x64 build, either from Visual studio or from command line:
+The above cmake configuration run will create an ``xgboost.sln`` solution file in the build directory. Build this solution in Release mode, either from Visual Studio or from the command line:
 
 .. code-block:: bash
 
@@ -176,111 +177,104 @@ Building Python Package with Default Toolchains
 ===============================================
 There are several ways to build and install the package from source:
 
-1. Use Python setuptools directly
+1. Build C++ core with CMake first
 
-  The XGBoost Python package supports most of the setuptools commands, here is a list of tested commands:
+  You can first build C++ library using CMake as described in :ref:`build_shared_lib`.
+  After compilation, a shared library will appear in ``lib/`` directory.
+  On Linux distributions, the shared library is ``lib/libxgboost.so``.
+  The install script ``pip install .`` will reuse the shared library instead of compiling
+  it from scratch, making it quite fast to run.
+
+  .. code-block:: console
+
+    $ cd python-package/
+    $ pip install .  # Will re-use lib/libxgboost.so
+
+2. Install the Python package directly
+
+  You can navigate to ``python-package/`` directory and install the Python package directly
+  by running
+
+  .. code-block:: console
+
+    $ cd python-package/
+    $ pip install -v .
+
+  which will compile XGBoost's native (C++) code using default CMake flags.
+  To enable additional compilation options, pass corresponding ``--config-settings``:
+
+  .. code-block:: console
+
+    $ pip install -v . --config-settings use_cuda=True --config-settings use_nccl=True
+
+  Pip 22.1 or later is required to use the ``--config-settings`` option.
+
+  Here are the available options for ``--config-settings``:
+
+  .. literalinclude:: ../python-package/packager/build_config.py
+    :language: python
+    :start-at: @dataclasses.dataclass
+    :end-before: def _set_config_setting(
+
+  ``use_system_libxgboost`` is a special option. See Item 4 below for
+  a detailed description.
+
+  .. note:: Verbose flag recommended
+
+    As ``pip install .`` will build C++ code, it will take a while to complete.
+    To ensure that the build is progressing successfully, we suggest that
+    you add the verbose flag (``-v``) when invoking ``pip install``.
+
+
+3. Editable installation
+
+  To further enable rapid development and iteration, we provide an **editable installation**.
+  In an editable installation, the installed package is simply a symbolic link to your
+  working copy of the XGBoost source code. So every change you make to your source
+  directory will be immediately visible to the Python interpreter. Here is how to
+  install XGBoost as editable installation:
 
   .. code-block:: bash
 
-    python setup.py install  # Install the XGBoost to your current Python environment.
-    python setup.py build    # Build the Python package.
-    python setup.py build_ext # Build only the C++ core.
-    python setup.py sdist     # Create a source distribution
-    python setup.py bdist     # Create a binary distribution
-    python setup.py bdist_wheel # Create a binary distribution with wheel format
-
-  Running ``python setup.py install`` will compile XGBoost using default CMake flags.  For
-  passing additional compilation options, append the flags to the command.  For example,
-  to enable CUDA acceleration and NCCL (distributed GPU) support:
-
-  .. code-block:: bash
-
-    python setup.py install --use-cuda --use-nccl
-
-  Please refer to ``setup.py`` for a complete list of available options.  Some other
-  options used for development are only available for using CMake directly.  See next
-  section on how to use CMake with setuptools manually.
-
-  You can install the created distribution packages using pip. For example, after running
-  ``sdist`` setuptools command, a tar ball similar to ``xgboost-1.0.0.tar.gz`` will be
-  created under the ``dist`` directory.  Then you can install it by invoking the following
-  command under ``dist`` directory:
-
-  .. code-block:: bash
-
-    # under python-package directory
-    cd dist
-    pip install ./xgboost-1.0.0.tar.gz
-
-
-  For details about these commands, please refer to the official document of `setuptools
-  <https://setuptools.readthedocs.io/en/latest/>`_, or just Google "how to install Python
-  package from source".  XGBoost Python package follows the general convention.
-  Setuptools is usually available with your Python distribution, if not you can install it
-  via system command.  For example on Debian or Ubuntu:
-
-  .. code-block:: bash
-
-    sudo apt-get install python-setuptools
-
-
-  For cleaning up the directory after running above commands, ``python setup.py clean`` is
-  not sufficient.  After copying out the build result, simply running ``git clean -xdf``
-  under ``python-package`` is an efficient way to remove generated cache files.  If you
-  find weird behaviors in Python build or running linter, it might be caused by those
-  cached files.
-
-  For using develop command (editable installation), see next section.
-
-  .. code-block::
-
-    python setup.py develop   # Create a editable installation.
-    pip install -e .          # Same as above, but carried out by pip.
-
-
-2. Build C++ core with CMake first
-
-  This is mostly for C++ developers who don't want to go through the hooks in Python
-  setuptools.  You can build C++ library directly using CMake as described in above
-  sections.  After compilation, a shared object (or called dynamic linked library, jargon
-  depending on your platform) will appear in XGBoost's source tree under ``lib/``
-  directory.  On Linux distributions it's ``lib/libxgboost.so``.  From there all Python
-  setuptools commands will reuse that shared object instead of compiling it again.  This
-  is especially convenient if you are using the editable installation, where the installed
-  package is simply a link to the source tree.  We can perform rapid testing during
-  development.  Here is a simple bash script does that:
-
-  .. code-block:: bash
-
-    # Under xgboost source tree.
+    # Under xgboost source directory
     mkdir build
     cd build
-    cmake ..
-    make -j$(nproc)
+    # Build shared library libxgboost.so
+    cmake .. -GNinja
+    ninja
+    # Install as editable installation
     cd ../python-package
-    pip install -e .  # or equivalently python setup.py develop
+    pip install -e .
 
-3. Use ``libxgboost.so`` on system path.
+4. Use ``libxgboost.so`` on system path.
 
-  This is for distributing xgboost in a language independent manner, where
-  ``libxgboost.so`` is separately packaged with Python package.  Assuming `libxgboost.so`
-  is already presented in system library path, which can be queried via:
+  This option is useful for package managers that wish to separately package
+  ``libxgboost.so`` and the XGBoost Python package. For example, Conda
+  publishes ``libxgboost`` (for the shared library) and ``py-xgboost``
+  (for the Python package).
+
+  To use this option, first make sure that ``libxgboost.so`` exists in the system library path:
 
   .. code-block:: python
 
     import sys
-    import os
-    os.path.join(sys.prefix, 'lib')
+    import pathlib
+    libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so")
+    assert libpath.exists()
 
-  Then one only needs to provide an user option when installing Python package to reuse the
-  shared object in system path:
+  Then pass ``use_system_libxgboost=True`` option to ``pip install``:
 
   .. code-block:: bash
 
-    cd xgboost/python-package
-    python setup.py install --use-system-libxgboost
+    cd python-package
+    pip install . --config-settings use_system_libxgboost=True
 
 
+.. note::
+
+  See :doc:`contrib/python_packaging` for instructions on packaging
+  and distributing XGBoost as Python distributions.
+
 .. _python_mingw:
 
 Building Python Package for Windows with MinGW-w64 (Advanced)
@@ -297,7 +291,7 @@ So you may want to build XGBoost with GCC own your own risk. This presents some
 2. ``-O3`` is OK.
 3. ``-mtune=native`` is also OK.
 4. Don't use ``-march=native`` gcc flag. Using it causes the Python interpreter to crash if the DLL was actually used.
-5. You may need to provide the lib with the runtime libs. If ``mingw32/bin`` is not in ``PATH``, build a wheel (``python setup.py bdist_wheel``), open it with an archiver and put the needed dlls to the directory where ``xgboost.dll`` is situated. Then you can install the wheel with ``pip``.
+5. You may need to provide the lib with the runtime libs. If ``mingw32/bin`` is not in ``PATH``, build a wheel (``pip wheel``), open it with an archiver and put the needed dlls to the directory where ``xgboost.dll`` is situated. Then you can install the wheel with ``pip``.
 
 ******************************
 Building R Package From Source
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index 6073e646a..76e06de35 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -35,8 +35,9 @@ calls ``cibuildwheel`` to build the wheel. The ``cibuildwheel`` is a library tha
 suitable Python environment for each OS and processor target. Since we don't have Apple Silion
 machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex
 task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
-``setup.py bdist_wheel``. Since XGBoost has a native library component, ``setup.py`` contains
-a glue code to call CMake and a C++ compiler to build the native library on the fly.)
+``pip wheel``. Since XGBoost has a native library component, we created a customized build
+backend that hooks into ``pip``. The customized backend contains the glue code to compile the native
+library on the fly.)
 
 *********************************************************
 Reproduce CI testing environments using Docker containers
diff --git a/doc/contrib/index.rst b/doc/contrib/index.rst
index c9c5f93a2..6a36cb108 100644
--- a/doc/contrib/index.rst
+++ b/doc/contrib/index.rst
@@ -23,6 +23,7 @@ Here are guidelines for contributing to various aspect of the XGBoost project:
   Community Guideline <community>
   donate
   coding_guide
+  python_packaging
   unit_tests
   Docs and Examples <docs>
   git_guide
diff --git a/doc/contrib/python_packaging.rst b/doc/contrib/python_packaging.rst
new file mode 100644
index 000000000..5cf085685
--- /dev/null
+++ b/doc/contrib/python_packaging.rst
@@ -0,0 +1,83 @@
+###########################################
+Notes on packaging XGBoost's Python package
+###########################################
+
+
+.. contents:: Contents
+  :local:
+
+.. _packaging_python_xgboost:
+
+***************************************************
+How to build binary wheels and source distributions
+***************************************************
+
+Wheels and source distributions (sdist for short) are the two main
+mechanisms for packaging and distributing Python packages.
+
+* A **source distribution** (sdist) is a tarball (``.tar.gz`` extension) that
+  contains the source code.
+* A **wheel** is a ZIP-compressed archive (with ``.whl`` extension)
+  representing a *built* distribution. Unlike an sdist, a wheel can contain
+  compiled components. The compiled components are compiled prior to distribution,
+  making it more convenient for end-users to install a wheel. Wheels containing
+  compiled components are referred to as **binary wheels**.
+
+See `Python Packaging User Guide <https://packaging.python.org/en/latest/>`_
+to learn more about how Python packages in general are packaged and
+distributed.
+
+For the remainder of this document, we will focus on packaging and
+distributing XGBoost.
+
+Building sdists
+===============
+
+In the case of XGBoost, an sdist contains both the Python code as well as
+the C++ code, so that the core part of XGBoost can be compiled into the
+shared library ``libxgboost.so`` [#shared_lib_name]_.
+
+You can obtain an sdist as follows:
+
+.. code-block:: console
+
+  $ python -m build --sdist .
+
+(You'll need to install the ``build`` package first:
+``pip install build`` or ``conda install python-build``.)
+
+Running ``pip install`` with an sdist will launch CMake and a C++ compiler
+to compile the bundled C++ code into ``libxgboost.so``:
+
+.. code-block:: console
+
+  $ pip install -v xgboost-2.0.0.tar.gz  # Add -v to show build progress
+
+Building binary wheels
+======================
+
+You can also build a wheel as follows:
+
+.. code-block:: console
+
+   $ pip wheel --no-deps -v .
+
+Notably, the resulting wheel contains a copy of the shared library
+``libxgboost.so`` [#shared_lib_name]_. The wheel is a **binary wheel**,
+since it contains a compiled binary.
+
+
+Running ``pip install`` with the binary wheel will extract the content of
+the wheel into the current Python environment. Since the wheel already
+contains a pre-built copy of ``libxgboost.so``, it does not have to be
+built at the time of install. So ``pip install`` with the binary wheel
+completes quickly:
+
+.. code-block:: console
+  
+  $ pip install xgboost-2.0.0-py3-none-linux_x86_64.whl  # Completes quickly
+
+.. rubric:: Footnotes
+
+.. [#shared_lib_name] The name of the shared library file will differ
+   depending on the operating system in use. See :ref:`build_shared_lib`.
diff --git a/doc/install.rst b/doc/install.rst
index 03daf465f..0e155f647 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -16,15 +16,28 @@ Stable Release
 Python
 ------
 
-Pre-built binary are uploaded to PyPI (Python Package Index) for each release.  Supported platforms are Linux (x86_64, aarch64), Windows (x86_64) and MacOS (x86_64, Apple Silicon).
+Pre-built binary wheels are uploaded to PyPI (Python Package Index) for each release. Supported platforms are Linux (x86_64, aarch64), Windows (x86_64) and MacOS (x86_64, Apple Silicon).
 
 .. code-block:: bash
 
+  # Pip 21.3+ is required
   pip install xgboost
 
 
 You might need to run the command with ``--user`` flag or use ``virtualenv`` if you run
-into permission errors.  Python pre-built binary capability for each platform:
+into permission errors.
+
+.. note:: Windows users need to install Visual C++ Redistributable
+
+  XGBoost requires DLLs from `Visual C++ Redistributable
+  <https://www.microsoft.com/en-us/download/details.aspx?id=48145>`_
+  in order to function, so make sure to install it. Exception: If
+  you have Visual Studio installed, you already have access to
+  necessary libraries and thus don't need to install Visual C++
+  Redistributable.
+
+
+Capabilities of binary wheels for each platform:
 
 .. |tick| unicode:: U+2714
 .. |cross| unicode:: U+2718
diff --git a/plugin/federated/README.md b/plugin/federated/README.md
index d83db6be1..631c44cee 100644
--- a/plugin/federated/README.md
+++ b/plugin/federated/README.md
@@ -19,7 +19,7 @@ cmake .. -GNinja \
  -DUSE_NCCL=ON
 ninja
 cd ../python-package
-pip install -e .  # or equivalently python setup.py develop
+pip install -e .
 ```
 If CMake fails to locate gRPC, you may need to pass `-DCMAKE_PREFIX_PATH=<grpc path>` to CMake.
 
diff --git a/python-package/MANIFEST.in b/python-package/MANIFEST.in
deleted file mode 100644
index 23f2684c2..000000000
--- a/python-package/MANIFEST.in
+++ /dev/null
@@ -1,56 +0,0 @@
-include README.rst
-include xgboost/LICENSE
-include xgboost/VERSION
-include xgboost/CMakeLists.txt
-
-include xgboost/py.typed
-recursive-include xgboost *.py
-recursive-include xgboost/cmake *
-exclude xgboost/cmake/RPackageInstall.cmake.in
-exclude xgboost/cmake/RPackageInstallTargetSetup.cmake
-exclude xgboost/cmake/Sanitizer.cmake
-exclude xgboost/cmake/modules/FindASan.cmake
-exclude xgboost/cmake/modules/FindLSan.cmake
-exclude xgboost/cmake/modules/FindLibR.cmake
-exclude xgboost/cmake/modules/FindTSan.cmake
-exclude xgboost/cmake/modules/FindUBSan.cmake
-recursive-include xgboost/include *
-recursive-include xgboost/plugin *
-recursive-include xgboost/src *
-
-recursive-include xgboost/gputreeshap/GPUTreeShap *
-
-include xgboost/rabit/CMakeLists.txt
-recursive-include xgboost/rabit/include *
-recursive-include xgboost/rabit/src *
-prune xgboost/rabit/doc
-prune xgboost/rabit/guide
-
-include xgboost/dmlc-core/CMakeLists.txt
-
-recursive-include xgboost/dmlc-core/cmake *
-exclude xgboost/dmlc-core/cmake/gtest_cmake.in
-exclude xgboost/dmlc-core/cmake/lint.cmake
-exclude xgboost/dmlc-core/cmake/Sanitizer.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindASan.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindLSan.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindTSan.cmake
-exclude xgboost/dmlc-core/cmake/Modules/FindUBSan.cmake
-
-recursive-include xgboost/dmlc-core/include *
-recursive-include xgboost/dmlc-core/include *
-recursive-include xgboost/dmlc-core/make *
-recursive-include xgboost/dmlc-core/src *
-include xgboost/dmlc-core/tracker/dmlc-submit
-recursive-include xgboost/dmlc-core/tracker/dmlc_tracker *.py
-include xgboost/dmlc-core/tracker/yarn/build.bat
-include xgboost/dmlc-core/tracker/yarn/build.sh
-include xgboost/dmlc-core/tracker/yarn/pom.xml
-recursive-include xgboost/dmlc-core/tracker/yarn/src *
-include xgboost/dmlc-core/windows/dmlc.sln
-include xgboost/dmlc-core/windows/dmlc/dmlc.vcxproj
-
-prune xgboost/dmlc-core/doc
-prune xgboost/dmlc-core/scripts/
-
-global-exclude *.py[oc]
diff --git a/python-package/hatch_build.py b/python-package/hatch_build.py
new file mode 100644
index 000000000..696787fa2
--- /dev/null
+++ b/python-package/hatch_build.py
@@ -0,0 +1,22 @@
+"""
+Custom hook to customize the behavior of Hatchling.
+Here, we customize the tag of the generated wheels.
+"""
+import sysconfig
+from typing import Any, Dict
+
+from hatchling.builders.hooks.plugin.interface import BuildHookInterface
+
+
+def get_tag() -> str:
+    """Return the wheel tag ``py3-none-<platform>`` for the current system"""
+    platform_tag = sysconfig.get_platform().translate(str.maketrans("-.", "__"))
+    return "py3-none-" + platform_tag
+
+
+class CustomBuildHook(BuildHookInterface):
+    """A custom build hook that sets the wheel tag in the build data"""
+
+    def initialize(self, version: str, build_data: Dict[str, Any]) -> None:
+        """This step occurs immediately before each build."""
+        build_data["tag"] = get_tag()
diff --git a/python-package/packager/__init__.py b/python-package/packager/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python-package/packager/build_config.py b/python-package/packager/build_config.py
new file mode 100644
index 000000000..290cf15db
--- /dev/null
+++ b/python-package/packager/build_config.py
@@ -0,0 +1,56 @@
+"""Build configuration"""
+import dataclasses
+from typing import Any, Dict, List, Optional
+
+
+@dataclasses.dataclass
+class BuildConfiguration:  # pylint: disable=R0902
+    """Configurations used when building libxgboost"""
+
+    # Whether to hide C++ symbols in libxgboost.so
+    hide_cxx_symbols: bool = True
+    # Whether to enable OpenMP
+    use_openmp: bool = True
+    # Whether to enable CUDA
+    use_cuda: bool = False
+    # Whether to enable NCCL
+    use_nccl: bool = False
+    # Whether to enable HDFS
+    use_hdfs: bool = False
+    # Whether to enable Azure Storage
+    use_azure: bool = False
+    # Whether to enable AWS S3
+    use_s3: bool = False
+    # Whether to enable the dense parser plugin
+    plugin_dense_parser: bool = False
+    # Special option: See explanation below
+    use_system_libxgboost: bool = False
+
+    def _set_config_setting(
+        self, config_settings: Dict[str, Any], field_name: str
+    ) -> None:
+        """Set one field from config_settings; reject names that are not fields"""
+        if field_name not in [x.name for x in dataclasses.fields(self)]:
+            raise ValueError(f"Field {field_name} is not a valid config_settings")
+        value = config_settings[field_name].lower() in ["true", "1", "on"]
+        setattr(self, field_name, value)
+
+    def update(self, config_settings: Optional[Dict[str, Any]]) -> None:
+        """Parse config_settings from Pip (or other PEP 517 frontend)
+
+        Only the options given are overridden; unknown names raise ValueError.
+        """
+        if config_settings is not None:
+            for field_name in config_settings:
+                self._set_config_setting(config_settings, field_name)
+
+    def get_cmake_args(self) -> List[str]:
+        """Convert build configuration to CMake args"""
+        cmake_args = []
+        for field_name in [x.name for x in dataclasses.fields(self)]:
+            if field_name in ["use_system_libxgboost"]:
+                continue
+            cmake_option = field_name.upper()
+            cmake_value = "ON" if getattr(self, field_name) is True else "OFF"
+            cmake_args.append(f"-D{cmake_option}={cmake_value}")
+        return cmake_args
diff --git a/python-package/packager/nativelib.py b/python-package/packager/nativelib.py
new file mode 100644
index 000000000..f7f5b4e79
--- /dev/null
+++ b/python-package/packager/nativelib.py
@@ -0,0 +1,157 @@
+"""
+Functions for building libxgboost
+"""
+import logging
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+from platform import system
+from typing import Optional
+
+from .build_config import BuildConfiguration
+
+
+def _lib_name() -> str:
+    """Name of the libxgboost shared library file on the current platform."""
+    os_name = system()
+    if os_name in ("Linux", "OS400") or os_name.upper().endswith("BSD"):
+        return "libxgboost.so"
+    if os_name == "Darwin":
+        return "libxgboost.dylib"
+    if os_name == "Windows":
+        return "xgboost.dll"
+    # Any other platform is rejected explicitly
+    raise NotImplementedError(f"System {os_name} not supported")
+
+
+def build_libxgboost(
+    cpp_src_dir: pathlib.Path,
+    build_dir: pathlib.Path,
+    build_config: BuildConfiguration,
+) -> pathlib.Path:
+    """
+    Build libxgboost inside build_dir and return the path to the built library.
+    Windows tries several generators; elsewhere a failure is retried without OpenMP.
+    """
+    logger = logging.getLogger("xgboost.packager.build_libxgboost")
+
+    if not cpp_src_dir.is_dir():
+        raise RuntimeError(f"Expected {cpp_src_dir} to be a directory")
+    logger.info(
+        "Building %s from the C++ source files in %s...", _lib_name(), str(cpp_src_dir)
+    )
+
+    def _build(*, generator: str) -> None:
+        """Configure with CMake using the given generator flag, then compile"""
+        cmake_cmd = ["cmake", str(cpp_src_dir), generator]
+        cmake_cmd.append("-DKEEP_BUILD_ARTIFACTS_IN_BINARY_DIR=ON")
+        cmake_cmd.extend(build_config.get_cmake_args())
+
+        # Flag for cross-compiling for Apple Silicon
+        # We use environment variable because it's the only way to pass down custom flags
+        # through the cibuildwheel package, which calls `pip wheel` command.
+        if "CIBW_TARGET_OSX_ARM64" in os.environ:
+            cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")
+
+        logger.info("CMake args: %s", str(cmake_cmd))
+        subprocess.check_call(cmake_cmd, cwd=build_dir)
+
+        if system() == "Windows":
+            subprocess.check_call(
+                ["cmake", "--build", ".", "--config", "Release"], cwd=build_dir
+            )
+        else:
+            nproc = os.cpu_count() or 1  # cpu_count() may return None
+            assert build_tool is not None
+            subprocess.check_call([build_tool, f"-j{nproc}"], cwd=build_dir)
+
+    if system() == "Windows":
+        supported_generators = (
+            "-GVisual Studio 17 2022",
+            "-GVisual Studio 16 2019",
+            "-GVisual Studio 15 2017",
+            "-GMinGW Makefiles",
+        )
+        for generator in supported_generators:
+            try:
+                _build(generator=generator)
+                logger.info(
+                    "Successfully built %s using generator %s", _lib_name(), generator
+                )
+                break
+            except subprocess.CalledProcessError as e:
+                logger.info(
+                    "Tried building with generator %s but failed with exception %s",
+                    generator,
+                    str(e),
+                )
+                # Empty build directory so the next generator starts from scratch
+                shutil.rmtree(build_dir)
+                build_dir.mkdir()
+        else:  # no break: every generator failed
+            raise RuntimeError(
+                "None of the supported generators produced a successful build! "
+                f"Supported generators: {supported_generators}"
+            )
+    else:
+        build_tool = "ninja" if shutil.which("ninja") else "make"
+        generator = "-GNinja" if build_tool == "ninja" else "-GUnix Makefiles"
+        try:
+            _build(generator=generator)
+        except subprocess.CalledProcessError as e:
+            logger.info("Failed to build with OpenMP. Exception: %s", str(e))
+            build_config.use_openmp = False
+            _build(generator=generator)
+
+    return build_dir / "lib" / _lib_name()
+
+
+def locate_local_libxgboost(
+    toplevel_dir: pathlib.Path,
+    logger: logging.Logger,
+) -> Optional[pathlib.Path]:
+    """
+    Look in the lib/ directory beside toplevel_dir; return the library path or None.
+    """
+    candidate = toplevel_dir.parent.joinpath("lib", _lib_name())
+    if not candidate.exists():
+        return None
+    logger.info("Found %s at %s", candidate.name, str(candidate.parent))
+    return candidate
+
+
+def locate_or_build_libxgboost(
+    toplevel_dir: pathlib.Path,
+    build_dir: pathlib.Path,
+    build_config: BuildConfiguration,
+) -> pathlib.Path:
+    """Find libxgboost locally or in the system prefix; otherwise build it"""
+    logger = logging.getLogger("xgboost.packager.locate_or_build_libxgboost")
+
+    # 1. A library already present in the project's lib/ directory always wins
+    local_lib = locate_local_libxgboost(toplevel_dir, logger=logger)
+    if local_lib is not None:
+        return local_lib
+
+    # 2. Library installed under sys.prefix, only when explicitly requested
+    if build_config.use_system_libxgboost:
+        sys_lib = pathlib.Path(sys.prefix).absolute().resolve() / "lib" / _lib_name()
+        if sys_lib.exists():
+            logger.info("Using system XGBoost: %s", str(sys_lib))
+            return sys_lib
+        raise RuntimeError(
+            f"use_system_libxgboost was specified but {_lib_name()} is "
+            f"not found in {sys_lib.parent}"
+        )
+
+    # 3. Build from source: cpp_src/ (sdist) or the parent of python-package/
+    bundled_src = toplevel_dir.joinpath("cpp_src")
+    if bundled_src.exists():
+        cpp_src_dir = bundled_src
+    else:
+        cpp_src_dir = toplevel_dir.parent
+        if not cpp_src_dir.joinpath("CMakeLists.txt").exists():
+            raise RuntimeError(f"Did not find CMakeLists.txt from {cpp_src_dir}")
+    return build_libxgboost(cpp_src_dir, build_dir=build_dir, build_config=build_config)
diff --git a/python-package/packager/pep517.py b/python-package/packager/pep517.py
new file mode 100644
index 000000000..56583e117
--- /dev/null
+++ b/python-package/packager/pep517.py
@@ -0,0 +1,157 @@
+"""
+Custom build backend for XGBoost Python package.
+Builds source distribution and binary wheels, following PEP 517 / PEP 660.
+Reuses components of Hatchling (https://github.com/pypa/hatch/tree/master/backend) for the sake
+of brevity.
+"""
+import dataclasses
+import logging
+import os
+import pathlib
+import tempfile
+from contextlib import contextmanager
+from typing import Any, Dict, Iterator, Optional, Union
+
+import hatchling.build
+
+from .build_config import BuildConfiguration
+from .nativelib import locate_local_libxgboost, locate_or_build_libxgboost
+from .sdist import copy_cpp_src_tree
+from .util import copy_with_logging, copytree_with_logging
+
+
+@contextmanager
+def cd(path: Union[str, pathlib.Path]) -> Iterator[str]:  # pylint: disable=C0103
+    """
+    Run the ``with`` body inside ``path``, then return to the previous directory.
+    Yields the resolved (real) path that was entered.
+    TODO(hcho3): Remove this once we adopt Python 3.11, which implements contextlib.chdir.
+    """
+    target = os.path.realpath(str(path))
+    previous = os.getcwd()
+    os.chdir(target)
+    try:
+        yield target
+    finally:
+        os.chdir(previous)
+
+
+TOPLEVEL_DIR = pathlib.Path(__file__).parent.parent.absolute().resolve()
+logging.basicConfig(level=logging.INFO)
+
+
+# Aliases
+get_requires_for_build_sdist = hatchling.build.get_requires_for_build_sdist
+get_requires_for_build_wheel = hatchling.build.get_requires_for_build_wheel
+get_requires_for_build_editable = hatchling.build.get_requires_for_build_editable
+
+
+def build_wheel(
+    wheel_directory: str,
+    config_settings: Optional[Dict[str, Any]] = None,
+    metadata_directory: Optional[str] = None,
+) -> str:
+    """Build a wheel bundling libxgboost; config_settings select the build options"""
+    logger = logging.getLogger("xgboost.packager.build_wheel")
+
+    build_config = BuildConfiguration()
+    build_config.update(config_settings)
+    logger.info("Parsed build configuration: %s", dataclasses.asdict(build_config))
+
+    # Stage Python package + libxgboost in a tempdir, then run Hatchling from there
+    with tempfile.TemporaryDirectory() as td:
+        td_path = pathlib.Path(td)
+        build_dir = td_path / "libbuild"
+        build_dir.mkdir()
+
+        workspace = td_path / "whl_workspace"
+        workspace.mkdir()
+        logger.info("Copying project files to temporary directory %s", str(workspace))
+
+        copy_with_logging(TOPLEVEL_DIR / "pyproject.toml", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "hatch_build.py", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "README.rst", workspace, logger=logger)
+
+        pkg_path = workspace / "xgboost"
+        copytree_with_logging(TOPLEVEL_DIR / "xgboost", pkg_path, logger=logger)
+        lib_path = pkg_path / "lib"
+        lib_path.mkdir()
+        libxgboost = locate_or_build_libxgboost(
+            TOPLEVEL_DIR, build_dir=build_dir, build_config=build_config
+        )
+        copy_with_logging(libxgboost, lib_path, logger=logger)
+
+        with cd(workspace):
+            wheel_name = hatchling.build.build_wheel(
+                wheel_directory, config_settings, metadata_directory
+            )
+    return wheel_name
+
+
+def build_sdist(
+    sdist_directory: str,
+    config_settings: Optional[Dict[str, Any]] = None,
+) -> str:
+    """Build an sdist bundling the C++ sources in cpp_src/; rejects config_settings"""
+    logger = logging.getLogger("xgboost.packager.build_sdist")
+
+    if config_settings:
+        raise NotImplementedError(
+            "XGBoost's custom build backend doesn't support config_settings option "
+            f"when building sdist. {config_settings=}"
+        )
+
+    cpp_src_dir = TOPLEVEL_DIR.parent
+    if not cpp_src_dir.joinpath("CMakeLists.txt").exists():
+        raise RuntimeError(f"Did not find CMakeLists.txt from {cpp_src_dir}")
+
+    # Stage Python package, packager/ and C++ sources in a tempdir for Hatchling
+    with tempfile.TemporaryDirectory() as td:
+        td_path = pathlib.Path(td)
+
+        workspace = td_path / "sdist_workspace"
+        workspace.mkdir()
+        logger.info("Copying project files to temporary directory %s", str(workspace))
+
+        copy_with_logging(TOPLEVEL_DIR / "pyproject.toml", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "hatch_build.py", workspace, logger=logger)
+        copy_with_logging(TOPLEVEL_DIR / "README.rst", workspace, logger=logger)
+
+        copytree_with_logging(
+            TOPLEVEL_DIR / "xgboost", workspace / "xgboost", logger=logger
+        )
+        copytree_with_logging(
+            TOPLEVEL_DIR / "packager", workspace / "packager", logger=logger
+        )
+
+        temp_cpp_src_dir = workspace / "cpp_src"
+        copy_cpp_src_tree(cpp_src_dir, target_dir=temp_cpp_src_dir, logger=logger)
+
+        with cd(workspace):
+            sdist_name = hatchling.build.build_sdist(sdist_directory, config_settings)
+    return sdist_name
+
+
+def build_editable(
+    wheel_directory: str,
+    config_settings: Optional[Dict[str, Any]] = None,
+    metadata_directory: Optional[str] = None,
+) -> str:
+    """Build an editable install via Hatchling; needs a pre-built local libxgboost."""
+    logger = logging.getLogger("xgboost.packager.build_editable")
+
+    if config_settings:
+        raise NotImplementedError(
+            "XGBoost's custom build backend doesn't support config_settings option "
+            f"when building editable installation. {config_settings=}"
+        )
+
+    if locate_local_libxgboost(TOPLEVEL_DIR, logger=logger) is None:
+        raise RuntimeError(
+            "To use the editable installation, first build libxgboost with CMake. "
+            "See https://xgboost.readthedocs.io/en/latest/build.html for detailed instructions."
+        )
+
+    return hatchling.build.build_editable(
+        wheel_directory, config_settings, metadata_directory
+    )
diff --git a/python-package/packager/sdist.py b/python-package/packager/sdist.py
new file mode 100644
index 000000000..af9fbca0d
--- /dev/null
+++ b/python-package/packager/sdist.py
@@ -0,0 +1,27 @@
+"""
+Functions for building sdist
+"""
+import logging
+import pathlib
+
+from .util import copy_with_logging, copytree_with_logging
+
+
+def copy_cpp_src_tree(
+    cpp_src_dir: pathlib.Path, target_dir: pathlib.Path, logger: logging.Logger
+) -> None:
+    """Copy C++ source tree into build directory"""
+
+    for subdir in [
+        "src",
+        "include",
+        "dmlc-core",
+        "gputreeshap",
+        "rabit",
+        "cmake",
+        "plugin",
+    ]:
+        copytree_with_logging(cpp_src_dir / subdir, target_dir / subdir, logger=logger)
+
+    for filename in ["CMakeLists.txt", "LICENSE"]:
+        copy_with_logging(cpp_src_dir.joinpath(filename), target_dir, logger=logger)
diff --git a/python-package/packager/util.py b/python-package/packager/util.py
new file mode 100644
index 000000000..0fff062d7
--- /dev/null
+++ b/python-package/packager/util.py
@@ -0,0 +1,25 @@
+"""
+Utility functions for implementing PEP 517 backend
+"""
+import logging
+import pathlib
+import shutil
+
+
+def copytree_with_logging(
+    src: pathlib.Path, dest: pathlib.Path, logger: logging.Logger
+) -> None:
+    """Call shutil.copytree() with logging"""
+    logger.info("Copying %s -> %s", str(src), str(dest))
+    shutil.copytree(src, dest)
+
+
+def copy_with_logging(
+    src: pathlib.Path, dest: pathlib.Path, logger: logging.Logger
+) -> None:
+    """Call shutil.copy() with logging"""
+    if dest.is_dir():
+        logger.info("Copying %s -> %s", str(src), str(dest / src.name))
+    else:
+        logger.info("Copying %s -> %s", str(src), str(dest))
+    shutil.copy(src, dest)
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
new file mode 100644
index 000000000..8f120df5d
--- /dev/null
+++ b/python-package/pyproject.toml
@@ -0,0 +1,42 @@
+[build-system]
+requires = [
+    "hatchling>=1.12.1"
+]
+backend-path = ["."]
+build-backend = "packager.pep517"
+
+[project]
+name = "xgboost"
+version = "2.0.0-dev"
+authors = [
+    {name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu"},
+    {name = "Jiaming Yuan", email = "jm.yuan@outlook.com"}
+]
+description = "XGBoost Python Package"
+readme = {file = "README.rst", content-type = "text/x-rst"}
+requires-python = ">=3.8"
+license = {text = "Apache-2.0"}
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Development Status :: 5 - Production/Stable",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10"
+]
+dependencies = [
+    "numpy",
+    "scipy"
+]
+
+[project.optional-dependencies]
+pandas = ["pandas"]
+scikit-learn = ["scikit-learn"]
+dask = ["dask", "pandas", "distributed"]
+datatable = ["datatable"]
+plotting = ["graphviz", "matplotlib"]
+pyspark = ["pyspark", "scikit-learn", "cloudpickle"]
+
+[tool.hatch.build.targets.wheel.hooks.custom]
diff --git a/python-package/setup.py b/python-package/setup.py
deleted file mode 100644
index fe1cbf2e9..000000000
--- a/python-package/setup.py
+++ /dev/null
@@ -1,399 +0,0 @@
-"""Setup xgboost package."""
-import logging
-import os
-import shutil
-import subprocess
-import sys
-from platform import system
-from typing import List, Optional
-
-from setuptools import Extension, find_packages, setup
-from setuptools.command import build_ext, install, install_lib, sdist
-
-# You can't use `pip install .` as pip copies setup.py to a temporary
-# directory, parent directory is no longer reachable (isolated build) .
-CURRENT_DIR = os.path.abspath(os.path.dirname(__file__))
-sys.path.insert(0, CURRENT_DIR)
-
-# Options only effect `python setup.py install`, building `bdist_wheel`
-# requires using CMake directly.
-USER_OPTIONS = {
-    # libxgboost options.
-    "use-openmp": (None, "Build with OpenMP support.", 1),
-    "use-cuda": (None, "Build with GPU acceleration.", 0),
-    "use-nccl": (None, "Build with NCCL to enable distributed GPU support.", 0),
-    "build-with-shared-nccl": (None, "Build with shared NCCL library.", 0),
-    "hide-cxx-symbols": (None, "Hide all C++ symbols during build.", 1),
-    "use-hdfs": (None, "Build with HDFS support", 0),
-    "use-azure": (None, "Build with AZURE support.", 0),
-    "use-s3": (None, "Build with S3 support", 0),
-    "plugin-dense-parser": (None, "Build dense parser plugin.", 0),
-    # Python specific
-    "use-system-libxgboost": (None, "Use libxgboost.so in system path.", 0),
-}
-
-NEED_CLEAN_TREE = set()
-NEED_CLEAN_FILE = set()
-BUILD_TEMP_DIR = None
-
-
-def lib_name() -> str:
-    """Return platform dependent shared object name."""
-    if system() == "Linux" or system().upper().endswith("BSD"):
-        name = "libxgboost.so"
-    elif system() == "Darwin":
-        name = "libxgboost.dylib"
-    elif system() == "Windows":
-        name = "xgboost.dll"
-    elif system() == "OS400":
-        name = "libxgboost.so"
-    return name
-
-
-def copy_tree(src_dir: str, target_dir: str) -> None:
-    """Copy source tree into build directory."""
-
-    def clean_copy_tree(src: str, dst: str) -> None:
-        shutil.copytree(src, dst)
-        NEED_CLEAN_TREE.add(os.path.abspath(dst))
-
-    def clean_copy_file(src: str, dst: str) -> None:
-        shutil.copy(src, dst)
-        NEED_CLEAN_FILE.add(os.path.abspath(dst))
-
-    src = os.path.join(src_dir, "src")
-    inc = os.path.join(src_dir, "include")
-    dmlc_core = os.path.join(src_dir, "dmlc-core")
-    gputreeshap = os.path.join(src_dir, "gputreeshap")
-    rabit = os.path.join(src_dir, "rabit")
-    cmake = os.path.join(src_dir, "cmake")
-    plugin = os.path.join(src_dir, "plugin")
-
-    clean_copy_tree(src, os.path.join(target_dir, "src"))
-    clean_copy_tree(inc, os.path.join(target_dir, "include"))
-    clean_copy_tree(dmlc_core, os.path.join(target_dir, "dmlc-core"))
-    clean_copy_tree(gputreeshap, os.path.join(target_dir, "gputreeshap"))
-    clean_copy_tree(rabit, os.path.join(target_dir, "rabit"))
-    clean_copy_tree(cmake, os.path.join(target_dir, "cmake"))
-    clean_copy_tree(plugin, os.path.join(target_dir, "plugin"))
-
-    cmake_list = os.path.join(src_dir, "CMakeLists.txt")
-    clean_copy_file(cmake_list, os.path.join(target_dir, "CMakeLists.txt"))
-    lic = os.path.join(src_dir, "LICENSE")
-    clean_copy_file(lic, os.path.join(target_dir, "LICENSE"))
-
-
-def clean_up() -> None:
-    """Removed copied files."""
-    for path in NEED_CLEAN_TREE:
-        shutil.rmtree(path)
-    for path in NEED_CLEAN_FILE:
-        os.remove(path)
-
-
-class CMakeExtension(Extension):  # pylint: disable=too-few-public-methods
-    """Wrapper for extension"""
-
-    def __init__(self, name: str) -> None:
-        super().__init__(name=name, sources=[])
-
-
-class BuildExt(build_ext.build_ext):  # pylint: disable=too-many-ancestors
-    """Custom build_ext command using CMake."""
-
-    logger = logging.getLogger("XGBoost build_ext")
-
-    # pylint: disable=too-many-arguments
-    def build(
-        self,
-        src_dir: str,
-        build_dir: str,
-        generator: str,
-        build_tool: Optional[str] = None,
-        use_omp: int = 1,
-    ) -> None:
-        """Build the core library with CMake."""
-        cmake_cmd = ["cmake", src_dir, generator]
-
-        for k, v in USER_OPTIONS.items():
-            arg = k.replace("-", "_").upper()
-            value = str(v[2])
-            if arg == "USE_SYSTEM_LIBXGBOOST":
-                continue
-            if arg == "USE_OPENMP" and use_omp == 0:
-                cmake_cmd.append("-D" + arg + "=0")
-                continue
-            cmake_cmd.append("-D" + arg + "=" + value)
-
-        # Flag for cross-compiling for Apple Silicon
-        # We use environment variable because it's the only way to pass down custom flags
-        # through the cibuildwheel package, which otherwise calls `python setup.py bdist_wheel`
-        # command.
-        if "CIBW_TARGET_OSX_ARM64" in os.environ:
-            cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")
-
-        self.logger.info("Run CMake command: %s", str(cmake_cmd))
-        subprocess.check_call(cmake_cmd, cwd=build_dir)
-
-        if system() != "Windows":
-            nproc = os.cpu_count()
-            assert build_tool is not None
-            subprocess.check_call([build_tool, "-j" + str(nproc)], cwd=build_dir)
-        else:
-            subprocess.check_call(
-                ["cmake", "--build", ".", "--config", "Release"], cwd=build_dir
-            )
-
-    def build_cmake_extension(self) -> None:
-        """Configure and build using CMake"""
-        if USER_OPTIONS["use-system-libxgboost"][2]:
-            self.logger.info("Using system libxgboost.")
-            return
-
-        build_dir = self.build_temp
-        global BUILD_TEMP_DIR  # pylint: disable=global-statement
-        BUILD_TEMP_DIR = build_dir
-        libxgboost = os.path.abspath(
-            os.path.join(CURRENT_DIR, os.path.pardir, "lib", lib_name())
-        )
-
-        if os.path.exists(libxgboost):
-            self.logger.info("Found shared library, skipping build.")
-            return
-
-        src_dir = "xgboost"
-        try:
-            copy_tree(
-                os.path.join(CURRENT_DIR, os.path.pardir),
-                os.path.join(self.build_temp, src_dir),
-            )
-        except Exception:  # pylint: disable=broad-except
-            copy_tree(src_dir, os.path.join(self.build_temp, src_dir))
-
-        self.logger.info("Building from source. %s", libxgboost)
-        if not os.path.exists(build_dir):
-            os.mkdir(build_dir)
-        if shutil.which("ninja"):
-            build_tool = "ninja"
-        else:
-            build_tool = "make"
-        if sys.platform.startswith("os400"):
-            build_tool = "make"
-
-        if system() == "Windows":
-            # Pick up from LGB, just test every possible tool chain.
-            for vs in (
-                "-GVisual Studio 17 2022",
-                "-GVisual Studio 16 2019",
-                "-GVisual Studio 15 2017",
-                "-GVisual Studio 14 2015",
-                "-GMinGW Makefiles",
-            ):
-                try:
-                    self.build(src_dir, build_dir, vs)
-                    self.logger.info(
-                        "%s is used for building Windows distribution.", vs
-                    )
-                    break
-                except subprocess.CalledProcessError:
-                    shutil.rmtree(build_dir)
-                    os.mkdir(build_dir)
-                    continue
-        else:
-            gen = "-GNinja" if build_tool == "ninja" else "-GUnix Makefiles"
-            try:
-                self.build(src_dir, build_dir, gen, build_tool, use_omp=1)
-            except subprocess.CalledProcessError:
-                self.logger.warning("Disabling OpenMP support.")
-                self.build(src_dir, build_dir, gen, build_tool, use_omp=0)
-
-    def build_extension(self, ext: Extension) -> None:
-        """Override the method for dispatching."""
-        if isinstance(ext, CMakeExtension):
-            self.build_cmake_extension()
-        else:
-            super().build_extension(ext)
-
-    def copy_extensions_to_source(self) -> None:
-        """Dummy override.  Invoked during editable installation.  Our binary
-        should available in `lib`.
-
-        """
-        if not os.path.exists(
-            os.path.join(CURRENT_DIR, os.path.pardir, "lib", lib_name())
-        ):
-            raise ValueError(
-                "For using editable installation, please "
-                + "build the shared object first with CMake."
-            )
-
-
-class Sdist(sdist.sdist):  # pylint: disable=too-many-ancestors
-    """Copy c++ source into Python directory."""
-
-    logger = logging.getLogger("xgboost sdist")
-
-    def run(self) -> None:
-        copy_tree(
-            os.path.join(CURRENT_DIR, os.path.pardir),
-            os.path.join(CURRENT_DIR, "xgboost"),
-        )
-        libxgboost = os.path.join(CURRENT_DIR, os.path.pardir, "lib", lib_name())
-        if os.path.exists(libxgboost):
-            self.logger.warning(
-                "Found shared library, removing to avoid being included in source distribution."
-            )
-            os.remove(libxgboost)
-        super().run()
-
-
-class InstallLib(install_lib.install_lib):
-    """Copy shared object into installation directory."""
-
-    logger = logging.getLogger("xgboost install_lib")
-
-    def install(self) -> List[str]:
-        outfiles = super().install()
-
-        if USER_OPTIONS["use-system-libxgboost"][2] != 0:
-            self.logger.info("Using system libxgboost.")
-            lib_path = os.path.join(sys.prefix, "lib")
-            msg = (
-                "use-system-libxgboost is specified, but "
-                + lib_name()
-                + " is not found in: "
-                + lib_path
-            )
-            assert os.path.exists(os.path.join(lib_path, lib_name())), msg
-            return []
-
-        lib_dir = os.path.join(self.install_dir, "xgboost", "lib")
-        if not os.path.exists(lib_dir):
-            os.mkdir(lib_dir)
-        dst = os.path.join(self.install_dir, "xgboost", "lib", lib_name())
-
-        libxgboost_path = lib_name()
-
-        assert BUILD_TEMP_DIR is not None
-        dft_lib_dir = os.path.join(CURRENT_DIR, os.path.pardir, "lib")
-        build_dir = os.path.join(BUILD_TEMP_DIR, "xgboost", "lib")
-
-        if os.path.exists(os.path.join(dft_lib_dir, libxgboost_path)):
-            # The library is built by CMake directly
-            src = os.path.join(dft_lib_dir, libxgboost_path)
-        else:
-            # The library is built by setup.py
-            src = os.path.join(build_dir, libxgboost_path)
-        self.logger.info("Installing shared library: %s", src)
-        dst, _ = self.copy_file(src, dst)
-        outfiles.append(dst)
-        return outfiles
-
-
-class Install(install.install):  # pylint: disable=too-many-instance-attributes
-    """An interface to install command, accepting XGBoost specific
-    arguments.
-
-    """
-
-    user_options = install.install.user_options + [
-        (k, v[0], v[1]) for k, v in USER_OPTIONS.items()
-    ]
-
-    def initialize_options(self) -> None:
-        super().initialize_options()
-        self.use_openmp = 1
-        self.use_cuda = 0
-        self.use_nccl = 0
-        self.build_with_shared_nccl = 0
-        self.hide_cxx_symbols = 1
-
-        self.use_hdfs = 0
-        self.use_azure = 0
-        self.use_s3 = 0
-
-        self.plugin_dense_parser = 0
-
-        self.use_system_libxgboost = 0
-
-    def run(self) -> None:
-        # setuptools will configure the options according to user supplied command line
-        # arguments, then here we propagate them into `USER_OPTIONS` for visibility to
-        # other sub-commands like `build_ext`.
-        for k, v in USER_OPTIONS.items():
-            arg = k.replace("-", "_")
-            if hasattr(self, arg):
-                USER_OPTIONS[k] = (v[0], v[1], getattr(self, arg))
-        super().run()
-
-
-if __name__ == "__main__":
-    # Supported commands:
-    # From internet:
-    # - pip install xgboost
-    # - pip install --no-binary :all: xgboost
-
-    # From source tree `xgboost/python-package`:
-    # - python setup.py build
-    # - python setup.py build_ext
-    # - python setup.py install
-    # - python setup.py sdist       && pip install <sdist-name>
-    # - python setup.py bdist_wheel && pip install <wheel-name>
-
-    # When XGBoost is compiled directly with CMake:
-    # - pip install -e .
-    # - python setup.py develop   # same as above
-    logging.basicConfig(level=logging.INFO)
-
-    with open(os.path.join(CURRENT_DIR, "README.rst"), encoding="utf-8") as fd:
-        description = fd.read()
-    with open(os.path.join(CURRENT_DIR, "xgboost/VERSION"), encoding="ascii") as fd:
-        version = fd.read().strip()
-
-    setup(
-        name="xgboost",
-        version=version,
-        description="XGBoost Python Package",
-        long_description=description,
-        long_description_content_type="text/x-rst",
-        install_requires=[
-            "numpy",
-            "scipy",
-        ],
-        ext_modules=[CMakeExtension("libxgboost")],
-        # error: expected "str": "Type[Command]"
-        cmdclass={
-            "build_ext": BuildExt,  # type: ignore
-            "sdist": Sdist,  # type: ignore
-            "install_lib": InstallLib,  # type: ignore
-            "install": Install,  # type: ignore
-        },
-        extras_require={
-            "pandas": ["pandas"],
-            "scikit-learn": ["scikit-learn"],
-            "dask": ["dask", "pandas", "distributed"],
-            "datatable": ["datatable"],
-            "plotting": ["graphviz", "matplotlib"],
-            "pyspark": ["pyspark", "scikit-learn", "cloudpickle"],
-        },
-        maintainer="Hyunsu Cho",
-        maintainer_email="chohyu01@cs.washington.edu",
-        zip_safe=False,
-        packages=find_packages(),
-        include_package_data=True,
-        license="Apache-2.0",
-        classifiers=[
-            "License :: OSI Approved :: Apache Software License",
-            "Development Status :: 5 - Production/Stable",
-            "Operating System :: OS Independent",
-            "Programming Language :: Python",
-            "Programming Language :: Python :: 3",
-            "Programming Language :: Python :: 3.8",
-            "Programming Language :: Python :: 3.9",
-            "Programming Language :: Python :: 3.10",
-        ],
-        python_requires=">=3.8",
-        url="https://github.com/dmlc/xgboost",
-    )
-
-    clean_up()
diff --git a/python-package/xgboost/config.py b/python-package/xgboost/config.py
index c08a13150..1691d473f 100644
--- a/python-package/xgboost/config.py
+++ b/python-package/xgboost/config.py
@@ -16,7 +16,7 @@ def config_doc(
     extra_note: Optional[str] = None,
     parameters: Optional[str] = None,
     returns: Optional[str] = None,
-    see_also: Optional[str] = None
+    see_also: Optional[str] = None,
 ) -> Callable[[_F], _F]:
     """Decorator to format docstring for config functions.
 
diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py
index 71058e8c9..d9eb14d0f 100644
--- a/python-package/xgboost/plotting.py
+++ b/python-package/xgboost/plotting.py
@@ -30,7 +30,7 @@ def plot_importance(
     grid: bool = True,
     show_values: bool = True,
     values_format: str = "{v}",
-    **kwargs: Any
+    **kwargs: Any,
 ) -> Axes:
     """Plot importance based on fitted trees.
 
@@ -155,7 +155,7 @@ def to_graphviz(
     no_color: Optional[str] = None,
     condition_node_params: Optional[dict] = None,
     leaf_node_params: Optional[dict] = None,
-    **kwargs: Any
+    **kwargs: Any,
 ) -> GraphvizSource:
     """Convert specified tree to graphviz instance. IPython can automatically plot
     the returned graphviz instance. Otherwise, you should call .render() method
@@ -250,7 +250,7 @@ def plot_tree(
     num_trees: int = 0,
     rankdir: Optional[str] = None,
     ax: Optional[Axes] = None,
-    **kwargs: Any
+    **kwargs: Any,
 ) -> Axes:
     """Plot specified tree.
 
diff --git a/tests/buildkite/build-cpu-arm64.sh b/tests/buildkite/build-cpu-arm64.sh
index 1a95a880a..fd00a7971 100755
--- a/tests/buildkite/build-cpu-arm64.sh
+++ b/tests/buildkite/build-cpu-arm64.sh
@@ -18,7 +18,7 @@ $command_wrapper bash -c "cd build && ctest --extra-verbose"
 
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
-  "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
+  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
 $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
   ${BUILDKITE_COMMIT} ${WHEEL_TAG}
 
diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh
index b25345b1b..c180695e8 100755
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -27,7 +27,7 @@ $command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc
   -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
 echo "--- Build binary wheel"
 $command_wrapper bash -c \
-  "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
+  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
 $command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
   ${BUILDKITE_COMMIT} ${WHEEL_TAG}
 
diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1
index 05d7aefb9..32cd2806a 100644
--- a/tests/buildkite/build-win64-gpu.ps1
+++ b/tests/buildkite/build-win64-gpu.ps1
@@ -24,21 +24,17 @@ if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 Write-Host "--- Build binary wheel"
 cd ../python-package
 conda activate
-& python setup.py bdist_wheel --universal
+& pip install --user -v "pip>=23"
+& pip --version
+& pip wheel --no-deps -v . --wheel-dir dist/
 Get-ChildItem . -Filter dist/*.whl |
 Foreach-Object {
   & python ../tests/ci_build/rename_whl.py $_.FullName $Env:BUILDKITE_COMMIT win_amd64
   if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 }
 
-Write-Host "--- Insert vcomp140.dll (OpenMP runtime) into the wheel"
-cd dist
-Copy-Item -Path ../../tests/ci_build/insert_vcomp140.py -Destination .
-& python insert_vcomp140.py *.whl
-if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-
 Write-Host "--- Upload Python wheel"
-cd ../..
+cd ..
 Get-ChildItem . -Filter python-package/dist/*.whl |
 Foreach-Object {
   & buildkite-agent artifact upload python-package/dist/$_
diff --git a/tests/ci_build/build_python_wheels.sh b/tests/ci_build/build_python_wheels.sh
index d91df2286..205b3b695 100644
--- a/tests/ci_build/build_python_wheels.sh
+++ b/tests/ci_build/build_python_wheels.sh
@@ -26,7 +26,7 @@ if [[ "$platform_id" == macosx_* ]]; then
         # cibuildwheel will take care of cross-compilation.
         wheel_tag=macosx_12_0_arm64
         cpython_ver=38
-        setup_env_var='CIBW_TARGET_OSX_ARM64=1'  # extra flag to be passed to setup.py
+        setup_env_var='CIBW_TARGET_OSX_ARM64=1'  # extra flag to be passed to xgboost.packager backend
         export PYTHON_CROSSENV=1
         export MACOSX_DEPLOYMENT_TARGET=12.0
         #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-arm64/llvm-openmp-11.1.0-hf3c4609_1.tar.bz2"
diff --git a/tests/ci_build/change_version.py b/tests/ci_build/change_version.py
index 62cb894dc..25561859c 100644
--- a/tests/ci_build/change_version.py
+++ b/tests/ci_build/change_version.py
@@ -40,14 +40,24 @@ def pypkg(
     major: int, minor: int, patch: int, rc: int, is_rc: bool, is_dev: bool
 ) -> None:
     version = f"{major}.{minor}.{patch}"
-    pyver_path = os.path.join("xgboost", "VERSION")
     pyver = version
     if is_rc:
         pyver = pyver + f"rc{rc}"
     if is_dev:
         pyver = pyver + "-dev"
+
+    pyver_path = os.path.join("xgboost", "VERSION")
     with open(pyver_path, "w") as fd:
-        fd.write(pyver)
+        fd.write(pyver + "\n")
+
+    pyprj_path = os.path.join("pyproject.toml")
+    with open(pyprj_path, "r") as fd:
+        pyprj = fd.read()
+    matched = re.search('version = "' + r"([0-9]+\.[0-9]+\.[0-9]+.*)" + '"', pyprj)
+    assert matched, "Couldn't find version string in pyproject.toml."
+    pyprj = pyprj[: matched.start(1)] + pyver + pyprj[matched.end(1) :]
+    with open(pyprj_path, "w") as fd:
+        fd.write(pyprj)
 
 
 @cd(R_PACKAGE)
diff --git a/tests/ci_build/conda_env/python_lint.yml b/tests/ci_build/conda_env/python_lint.yml
index a64f649a2..3d42dfaf3 100644
--- a/tests/ci_build/conda_env/python_lint.yml
+++ b/tests/ci_build/conda_env/python_lint.yml
@@ -18,6 +18,7 @@ dependencies:
 - cloudpickle
 - pytest
 - hypothesis
+- hatchling
 - pip:
   # TODO: Replace it with pyspark>=3.4 once 3.4 released.
   - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
diff --git a/tests/ci_build/conda_env/sdist_test.yml b/tests/ci_build/conda_env/sdist_test.yml
index acc4607ad..67a9324f7 100644
--- a/tests/ci_build/conda_env/sdist_test.yml
+++ b/tests/ci_build/conda_env/sdist_test.yml
@@ -8,5 +8,6 @@ dependencies:
 - wheel
 - cmake
 - ninja
+- python-build
 - c-compiler
 - cxx-compiler
diff --git a/tests/ci_build/insert_vcomp140.py b/tests/ci_build/insert_vcomp140.py
deleted file mode 100644
index cfa8d792d..000000000
--- a/tests/ci_build/insert_vcomp140.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import argparse
-import base64
-import glob
-import hashlib
-import os
-import pathlib
-import re
-import shutil
-import tempfile
-
-VCOMP140_PATH = "C:\\Windows\\System32\\vcomp140.dll"
-
-
-def get_sha256sum(path):
-    return (
-        base64.urlsafe_b64encode(hashlib.sha256(open(path, "rb").read()).digest())
-        .decode("latin1")
-        .rstrip("=")
-    )
-
-
-def update_record(*, wheel_content_dir, xgboost_version):
-    vcomp140_size = os.path.getsize(VCOMP140_PATH)
-    vcomp140_hash = get_sha256sum(VCOMP140_PATH)
-
-    record_path = wheel_content_dir / pathlib.Path(
-        f"xgboost-{xgboost_version}.dist-info/RECORD"
-    )
-    with open(record_path, "r") as f:
-        record_content = f.read()
-    record_content += f"xgboost-{xgboost_version}.data/data/xgboost/vcomp140.dll,"
-    record_content += f"sha256={vcomp140_hash},{vcomp140_size}\n"
-    with open(record_path, "w") as f:
-        f.write(record_content)
-
-
-def main(args):
-    candidates = list(sorted(glob.glob(args.wheel_path)))
-    for wheel_path in candidates:
-        print(f"Processing wheel {wheel_path}")
-        m = re.search(r"xgboost-(.*)\+.*-py3", wheel_path)
-        if not m:
-            raise ValueError(f"Wheel {wheel_path} has unexpected name")
-        version = m.group(1)
-        print(f"  Detected version for {wheel_path}: {version}")
-        print(f"  Inserting vcomp140.dll into {wheel_path}...")
-        with tempfile.TemporaryDirectory() as tempdir:
-            wheel_content_dir = pathlib.Path(tempdir) / "wheel_content"
-            print(f"    Extract {wheel_path} into {wheel_content_dir}")
-            shutil.unpack_archive(
-                wheel_path, extract_dir=wheel_content_dir, format="zip"
-            )
-            data_dir = wheel_content_dir / pathlib.Path(
-                f"xgboost-{version}.data/data/xgboost"
-            )
-            data_dir.mkdir(parents=True, exist_ok=True)
-
-            print(f"    Copy {VCOMP140_PATH} -> {data_dir}")
-            shutil.copy(VCOMP140_PATH, data_dir)
-
-            print(f"    Update RECORD")
-            update_record(wheel_content_dir=wheel_content_dir, xgboost_version=version)
-
-            print(f"    Content of {wheel_content_dir}:")
-            for e in sorted(wheel_content_dir.rglob("*")):
-                if e.is_file():
-                    r = e.relative_to(wheel_content_dir)
-                    print(f"      {r}")
-
-            print(f"    Create new wheel...")
-            new_wheel_tmp_path = pathlib.Path(tempdir) / "new_wheel"
-            shutil.make_archive(
-                str(new_wheel_tmp_path.resolve()),
-                format="zip",
-                root_dir=wheel_content_dir,
-            )
-            new_wheel_tmp_path = new_wheel_tmp_path.resolve().with_suffix(".zip")
-            new_wheel_tmp_path = new_wheel_tmp_path.rename(
-                new_wheel_tmp_path.with_suffix(".whl")
-            )
-            print(f"    Created new wheel {new_wheel_tmp_path}")
-
-            # Rename the old wheel with suffix .bak
-            # The new wheel takes the name of the old wheel
-            wheel_path_obj = pathlib.Path(wheel_path).resolve()
-            backup_path = wheel_path_obj.with_suffix(".whl.bak")
-            print(f"    Rename {wheel_path_obj} -> {backup_path}")
-            wheel_path_obj.replace(backup_path)
-            print(f"    Rename {new_wheel_tmp_path} -> {wheel_path_obj}")
-            new_wheel_tmp_path.replace(wheel_path_obj)
-
-            shutil.rmtree(wheel_content_dir)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "wheel_path", type=str, help="Path to wheel (wildcard permitted)"
-    )
-    args = parser.parse_args()
-
-    main(args)
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 00791e19d..3f553da9f 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -198,7 +198,7 @@ def main(args: argparse.Namespace) -> None:
             run_mypy(path)
             for path in [
                 # core
-                "python-package/xgboost/",
+                "python-package/",
                 # demo
                 "demo/json-model/json_parser.py",
                 "demo/guide-python/external_memory.py",
diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh
index 7375b4c9f..a70b27961 100755
--- a/tests/ci_build/test_python.sh
+++ b/tests/ci_build/test_python.sh
@@ -28,7 +28,7 @@ function install_xgboost {
   then
     pushd .
     cd python-package
-    python setup.py install --user
+    pip install --user -v .
     popd
   fi
 }

From b908680bec6981ff7d9c394ec625671400c7ce6c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 21 Apr 2023 05:24:10 +0800
Subject: [PATCH 09/34] Fix race condition in cpp metric tests. (#9058)

---
 src/metric/elementwise_metric.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index b1c764047..01aec16e1 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -217,7 +217,7 @@ struct EvalError {
     }
   }
   const char *Name() const {
-    static std::string name;
+    static thread_local std::string name;
     if (has_param_) {
       std::ostringstream os;
       os << "error";
@@ -315,7 +315,7 @@ struct EvalTweedieNLogLik {
         << "tweedie variance power must be in interval [1, 2)";
   }
   const char *Name() const {
-    static std::string name;
+    static thread_local std::string name;
     std::ostringstream os;
     os << "tweedie-nloglik@" << rho_;
     name = os.str();

From ee84e22c8df51c246b75dd7865869b37bf852181 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Apr 2023 18:16:08 +0800
Subject: [PATCH 10/34] Bump maven-checkstyle-plugin from 3.2.1 to 3.2.2 in
 /jvm-packages (#9073)

Bumps [maven-checkstyle-plugin](https://github.com/apache/maven-checkstyle-plugin) from 3.2.1 to 3.2.2.
- [Release notes](https://github.com/apache/maven-checkstyle-plugin/releases)
- [Commits](https://github.com/apache/maven-checkstyle-plugin/compare/maven-checkstyle-plugin-3.2.1...maven-checkstyle-plugin-3.2.2)

---
updated-dependencies:
- dependency-name: org.apache.maven.plugins:maven-checkstyle-plugin
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 8a6032984..ea15515de 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -374,7 +374,7 @@
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-checkstyle-plugin</artifactId>
-                <version>3.2.1</version>
+                <version>3.2.2</version>
                 <configuration>
                     <configLocation>checkstyle.xml</configLocation>
                     <failOnViolation>true</failOnViolation>

From 39b0fde0e7a3953d40e15619f45cc3227afdac89 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Apr 2023 18:16:34 +0800
Subject: [PATCH 11/34] Bump kryo from 5.4.0 to 5.5.0 in /jvm-packages (#9070)

Bumps [kryo](https://github.com/EsotericSoftware/kryo) from 5.4.0 to 5.5.0.
- [Release notes](https://github.com/EsotericSoftware/kryo/releases)
- [Commits](https://github.com/EsotericSoftware/kryo/compare/kryo-parent-5.4.0...kryo-parent-5.5.0)

---
updated-dependencies:
- dependency-name: com.esotericsoftware:kryo
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index ea15515de..61fd75d75 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -469,7 +469,7 @@
         <dependency>
             <groupId>com.esotericsoftware</groupId>
             <artifactId>kryo</artifactId>
-            <version>5.4.0</version>
+            <version>5.5.0</version>
         </dependency>
         <dependency>
             <groupId>org.scala-lang</groupId>

From 3b742dc4f179d5bf349c992c26655e69e7c0c0ab Mon Sep 17 00:00:00 2001
From: austinzh <austin.zhang79@gmail.com>
Date: Fri, 21 Apr 2023 07:38:07 -0400
Subject: [PATCH 12/34] Stop using Rabit in prediction (#9054)

---
 .../xgboost4j/scala/spark/PreXGBoost.scala    | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
index 176a54832..31d58224b 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2021-2022 by Contributors
+ Copyright (c) 2021-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@ import java.util.ServiceLoader
 import scala.collection.JavaConverters._
 import scala.collection.{AbstractIterator, Iterator, mutable}
 
-import ml.dmlc.xgboost4j.java.Communicator
 import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
 import ml.dmlc.xgboost4j.scala.spark.util.DataUtils.PackedParams
 import ml.dmlc.xgboost4j.scala.spark.params.XGBoostEstimatorCommon
@@ -35,7 +34,6 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
 import org.apache.commons.logging.LogFactory
 
 import org.apache.spark.TaskContext
-import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}
@@ -263,12 +261,6 @@ object PreXGBoost extends PreXGBoostProvider {
         private var batchCnt = 0
 
         private val batchIterImpl = rowIterator.grouped(inferBatchSize).flatMap { batchRow =>
-          if (batchCnt == 0) {
-            val rabitEnv = Array(
-              "DMLC_TASK_ID" -> TaskContext.getPartitionId().toString).toMap
-            Communicator.init(rabitEnv.asJava)
-          }
-
           val features = batchRow.iterator.map(row => row.getAs[Vector](featuresCol))
 
           import ml.dmlc.xgboost4j.scala.spark.util.DataUtils._
@@ -295,13 +287,8 @@ object PreXGBoost extends PreXGBoostProvider {
 
         override def hasNext: Boolean = batchIterImpl.hasNext
 
-        override def next(): Row = {
-          val ret = batchIterImpl.next()
-          if (!batchIterImpl.hasNext) {
-            Communicator.shutdown()
-          }
-          ret
-        }
+        override def next(): Row = batchIterImpl.next()
+
       }
     }
 

From 7032981350430c9cedbea53cd7049dc4406f2597 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 21 Apr 2023 22:53:58 +0800
Subject: [PATCH 13/34] Fix timer annotation. (#9057)

---
 src/tree/updater_quantile_hist.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 4906a21b7..749e799a8 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -294,6 +294,7 @@ class MultiTargetHistBuilder {
                      std::vector<bst_node_t> *p_out_position) {
     monitor_->Start(__func__);
     if (!task_->UpdateTreeLeaf()) {
+      monitor_->Stop(__func__);
       return;
     }
     for (auto const &part : partitioner_) {
@@ -397,6 +398,7 @@ class HistBuilder {
     evaluator_ = std::make_unique<HistEvaluator<CPUExpandEntry>>(ctx_, this->param_, fmat->Info(),
                                                                  col_sampler_);
     p_last_tree_ = p_tree;
+    monitor_->Stop(__func__);
   }
 
   void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree,

From 8dbe0510de6eedf07272468cb61aae8098a7dbd3 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Fri, 21 Apr 2023 12:32:05 -0700
Subject: [PATCH 14/34] More collective aggregators (#9060)

---
 src/collective/aggregator.h      | 68 ++++++++++++++++++++++++++++++++
 src/metric/auc.cc                | 34 +++-------------
 src/metric/elementwise_metric.cu | 25 +++++-------
 src/metric/multiclass_metric.cu  |  7 ++--
 src/metric/rank_metric.cc        | 14 ++-----
 src/metric/survival_metric.cu    |  7 ++--
 src/objective/adaptive.h         | 16 +++-----
 src/objective/quantile_obj.cu    |  8 ++--
 src/objective/regression_obj.cu  |  6 +--
 src/tree/fit_stump.cc            |  6 +--
 src/tree/updater_approx.cc       |  5 +--
 11 files changed, 107 insertions(+), 89 deletions(-)

diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
index 12222cf9d..fe7b65930 100644
--- a/src/collective/aggregator.h
+++ b/src/collective/aggregator.h
@@ -8,6 +8,7 @@
 #pragma once
 #include <xgboost/data.h>
 
+#include <limits>
 #include <string>
 #include <utility>
 #include <vector>
@@ -57,5 +58,72 @@ void ApplyWithLabels(MetaInfo const& info, T* buffer, size_t size, Function&& fu
     std::forward<Function>(function)(std::forward<Args>(args)...);
   }
 }
+
+/**
+ * @brief Find the global max of the given value across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the local value is returned.
+ *
+ * @tparam T The type of the value.
+ * @param info MetaInfo about the DMatrix.
+ * @param value The input for finding the global max.
+ * @return The global max of the input.
+ */
+template <typename T>
+T GlobalMax(MetaInfo const& info, T value) {
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kMax>(&value, 1);
+  }
+  return value;
+}
+
+/**
+ * @brief Find the global sum of the given values across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the original values are returned.
+ *
+ * @tparam T The type of the values.
+ * @param info MetaInfo about the DMatrix.
+ * @param values Pointer to the inputs to sum.
+ * @param size Number of values to sum.
+ */
+template <typename T>
+void GlobalSum(MetaInfo const& info, T* values, size_t size) {
+  if (info.IsRowSplit()) {
+    collective::Allreduce<collective::Operation::kSum>(values, size);
+  }
+}
+
+template <typename Container>
+void GlobalSum(MetaInfo const& info, Container* values) {
+  GlobalSum(info, values->data(), values->size());
+}
+
+/**
+ * @brief Find the global ratio of the given two values across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the local ratio is returned.
+ *
+ * @tparam T The type of the values.
+ * @param info MetaInfo about the DMatrix.
+ * @param dividend The dividend of the ratio.
+ * @param divisor The divisor of the ratio.
+ * @return The global ratio of the two inputs.
+ */
+template <typename T>
+T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
+  std::array<T, 2> results{dividend, divisor};
+  GlobalSum(info, &results);
+  std::tie(dividend, divisor) = std::tuple_cat(results);
+  if (divisor <= 0) {
+    return std::numeric_limits<T>::quiet_NaN();
+  } else {
+    return dividend / divisor;
+  }
+}
+
 }  // namespace collective
 }  // namespace xgboost
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index bde3127ed..473f5b02c 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -116,10 +116,7 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
 
   // we have 2 averages going in here, first is among workers, second is among
   // classes. allreduce sums up fp/tp auc for each class.
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(results.Values().data(),
-                                                       results.Values().size());
-  }
+  collective::GlobalSum(info, &results.Values());
   double auc_sum{0};
   double tp_sum{0};
   for (size_t c = 0; c < n_classes; ++c) {
@@ -293,17 +290,8 @@ class EvalAUC : public MetricNoCache {
         InvalidGroupAUC();
       }
 
-      std::array<double, 2> results{auc, static_cast<double>(valid_groups)};
-      if (info.IsRowSplit()) {
-        collective::Allreduce<collective::Operation::kSum>(results.data(), results.size());
-      }
-      auc = results[0];
-      valid_groups = static_cast<uint32_t>(results[1]);
-
-      if (valid_groups <= 0) {
-        auc = std::numeric_limits<double>::quiet_NaN();
-      } else {
-        auc /= valid_groups;
+      auc = collective::GlobalRatio(info, auc, static_cast<double>(valid_groups));
+      if (!std::isnan(auc)) {
         CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
                          << ", valid groups: " << valid_groups;
       }
@@ -323,19 +311,9 @@ class EvalAUC : public MetricNoCache {
         std::tie(fp, tp, auc) =
             static_cast<Curve *>(this)->EvalBinary(preds, info);
       }
-      double local_area = fp * tp;
-      std::array<double, 2> result{auc, local_area};
-      if (info.IsRowSplit()) {
-        collective::Allreduce<collective::Operation::kSum>(result.data(), result.size());
-      }
-      std::tie(auc, local_area) = common::UnpackArr(std::move(result));
-      if (local_area <= 0) {
-        // the dataset across all workers have only positive or negative sample
-        auc = std::numeric_limits<double>::quiet_NaN();
-      } else {
-        CHECK_LE(auc, local_area);
-        // normalization
-        auc = auc / local_area;
+      auc = collective::GlobalRatio(info, auc, fp * tp);
+      if (!std::isnan(auc)) {
+        CHECK_LE(auc, 1.0);
       }
     }
     if (std::isnan(auc)) {
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index 01aec16e1..bd1b0b2d8 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -8,6 +8,7 @@
  */
 #include <dmlc/registry.h>
 
+#include <array>
 #include <cmath>
 
 #include "../collective/communicator-inl.h"
@@ -197,10 +198,8 @@ class PseudoErrorLoss : public MetricNoCache {
           auto v = common::Sqr(slope) * (std::sqrt((1 + common::Sqr(a / slope))) - 1) * wt;
           return std::make_tuple(v, wt);
         });
-    double dat[2]{result.Residue(), result.Weights()};
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-    }
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     return EvalRowMAPE::GetFinal(dat[0], dat[1]);
   }
 };
@@ -366,10 +365,8 @@ struct EvalEWiseBase : public MetricNoCache {
           return std::make_tuple(residue, wt);
         });
 
-    double dat[2]{result.Residue(), result.Weights()};
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-    }
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -440,10 +437,8 @@ class QuantileError : public MetricNoCache {
     CHECK(!alpha_.Empty());
     if (info.num_row_ == 0) {
       // empty DMatrix on distributed env
-      double dat[2]{0.0, 0.0};
-      if (info.IsRowSplit()) {
-        collective::Allreduce<collective::Operation::kSum>(dat, 2);
-      }
+      std::array<double, 2> dat{0.0, 0.0};
+      collective::GlobalSum(info, &dat);
       CHECK_GT(dat[1], 0);
       return dat[0] / dat[1];
     }
@@ -480,10 +475,8 @@ class QuantileError : public MetricNoCache {
               loss(y_predt(sample_id, quantile_id, target_id), y_true(sample_id, target_id)) * w;
           return std::make_tuple(l, w);
         });
-    double dat[2]{result.Residue(), result.Weights()};
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-    }
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     CHECK_GT(dat[1], 0);
     return dat[0] / dat[1];
   }
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index a1d19dbc8..f6f3f3d04 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -6,6 +6,7 @@
  */
 #include <xgboost/metric.h>
 
+#include <array>
 #include <atomic>
 #include <cmath>
 
@@ -169,7 +170,7 @@ struct EvalMClassBase : public MetricNoCache {
     } else {
       CHECK(preds.Size() % info.labels.Size() == 0) << "label and prediction size not match";
     }
-    double dat[2] { 0.0, 0.0 };
+    std::array<double, 2> dat{0.0, 0.0};
     if (info.labels.Size() != 0) {
       const size_t nclass = preds.Size() / info.labels.Size();
       CHECK_GE(nclass, 1U)
@@ -181,9 +182,7 @@ struct EvalMClassBase : public MetricNoCache {
       dat[0] = result.Residue();
       dat[1] = result.Weights();
     }
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-    }
+    collective::GlobalSum(info, &dat);
     return Derived::GetFinal(dat[0], dat[1]);
   }
   /*!
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 000b88e80..4f272e939 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -238,14 +238,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
       exc.Rethrow();
     }
 
-    if (collective::IsDistributed() && info.IsRowSplit()) {
-      double dat[2]{sum_metric, static_cast<double>(ngroups)};
-      // approximately estimate the metric using mean
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return sum_metric / ngroups;
-    }
+    return collective::GlobalRatio(info, sum_metric, static_cast<double>(ngroups));
   }
 
   const char* Name() const override {
@@ -401,9 +394,8 @@ class EvalRankWithCache : public Metric {
 namespace {
 double Finalize(MetaInfo const& info, double score, double sw) {
   std::array<double, 2> dat{score, sw};
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(dat.data(), dat.size());
-  }
+  collective::GlobalSum(info, &dat);
+  std::tie(score, sw) = std::tuple_cat(dat);
   if (sw > 0.0) {
     score = score / sw;
   }
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index 9b1773dc5..5f8c8ee6a 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -7,6 +7,7 @@
 
 #include <dmlc/registry.h>
 
+#include <array>
 #include <memory>
 #include <vector>
 
@@ -211,10 +212,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
     auto result = reducer_.Reduce(*ctx_, info.weights_, info.labels_lower_bound_,
                                   info.labels_upper_bound_, preds);
 
-    double dat[2]{result.Residue(), result.Weights()};
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(dat, 2);
-    }
+    std::array<double, 2> dat{result.Residue(), result.Weights()};
+    collective::GlobalSum(info, &dat);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index 7494bceb1..ffd3ddec7 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -6,8 +6,9 @@
 #include <algorithm>
 #include <cstdint>  // std::int32_t
 #include <limits>
-#include <vector>  // std::vector
+#include <vector>   // std::vector
 
+#include "../collective/aggregator.h"
 #include "../collective/communicator-inl.h"
 #include "../common/common.h"
 #include "xgboost/base.h"                // bst_node_t
@@ -41,10 +42,7 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
   auto& quantiles = *p_quantiles;
   auto const& h_node_idx = nidx;
 
-  size_t n_leaf{h_node_idx.size()};
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kMax>(&n_leaf, 1);
-  }
+  size_t n_leaf = collective::GlobalMax(info, h_node_idx.size());
   CHECK(quantiles.empty() || quantiles.size() == n_leaf);
   if (quantiles.empty()) {
     quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -54,16 +52,12 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
   std::vector<int32_t> n_valids(quantiles.size());
   std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
                  [](float q) { return static_cast<int32_t>(!std::isnan(q)); });
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(n_valids.data(), n_valids.size());
-  }
+  collective::GlobalSum(info, &n_valids);
   // convert to 0 for all reduce
   std::replace_if(
       quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
   // use the mean value
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(quantiles.data(), quantiles.size());
-  }
+  collective::GlobalSum(info, &quantiles);
   for (size_t i = 0; i < n_leaf; ++i) {
     if (n_valids[i] > 0) {
       quantiles[i] /= static_cast<float>(n_valids[i]);
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index b6e540b24..b34f37ff9 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -1,6 +1,7 @@
 /**
  * Copyright 2023 by XGBoost contributors
  */
+#include <array>                            // std::array
 #include <cstddef>                          // std::size_t
 #include <cstdint>                          // std::int32_t
 #include <vector>                           // std::vector
@@ -170,10 +171,9 @@ class QuantileRegression : public ObjFunction {
     common::Mean(ctx_, *base_score, &temp);
     double meanq = temp(0) * sw;
 
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(&meanq, 1);
-      collective::Allreduce<collective::Operation::kSum>(&sw, 1);
-    }
+    std::array<double, 2> dat{meanq, sw};
+    collective::GlobalSum(info, &dat);
+    std::tie(meanq, sw) = std::tuple_cat(dat);
     meanq /= (sw + kRtEps);
     base_score->Reshape(1);
     base_score->Data()->Fill(meanq);
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index e0dbb2edc..4c5ed9ec8 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -728,10 +728,8 @@ class MeanAbsoluteError : public ObjFunction {
     std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
                    [w](float v) { return v * w; });
 
-    if (info.IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(out.Values().data(), out.Values().size());
-      collective::Allreduce<collective::Operation::kSum>(&w, 1);
-    }
+    collective::GlobalSum(info, &out.Values());
+    collective::GlobalSum(info, &w, 1);
 
     if (common::CloseTo(w, 0.0)) {
       // Mostly for handling empty dataset test.
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index 55f23b329..3533de772 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -8,6 +8,7 @@
 #include <cinttypes>  // std::int32_t
 #include <cstddef>    // std::size_t
 
+#include "../collective/aggregator.h"
 #include "../collective/communicator-inl.h"
 #include "../common/common.h"              // AssertGPUSupport
 #include "../common/numeric.h"             // cpu_impl::Reduce
@@ -45,10 +46,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
   }
   CHECK(h_sum.CContiguous());
 
-  if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(
-        reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
-  }
+  collective::GlobalSum(info, reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
 
   for (std::size_t i = 0; i < h_sum.Size(); ++i) {
     out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index d22e8f679..148614a7e 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -7,6 +7,7 @@
 #include <memory>
 #include <vector>
 
+#include "../collective/aggregator.h"
 #include "../common/random.h"
 #include "../data/gradient_index.h"
 #include "common_row_partitioner.h"
@@ -92,9 +93,7 @@ class GloablApproxBuilder {
     for (auto const &g : gpair) {
       root_sum.Add(g);
     }
-    if (p_fmat->Info().IsRowSplit()) {
-      collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
-    }
+    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
     std::vector<CPUExpandEntry> nodes{best};
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes);

From c512c3f46b0cec94d14f6a89d933f9740f66277a Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Sat, 22 Apr 2023 15:46:44 +0800
Subject: [PATCH 15/34] [jvm-packages] Bump rapids version. (#9056)

---
 jvm-packages/pom.xml                     | 4 ++--
 jvm-packages/xgboost4j-spark-gpu/pom.xml | 7 -------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 61fd75d75..1ab44d977 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -41,8 +41,8 @@
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
-        <cudf.version>22.12.0</cudf.version>
-        <spark.rapids.version>22.12.0</spark.rapids.version>
+        <cudf.version>23.02.0</cudf.version>
+        <spark.rapids.version>23.02.0</spark.rapids.version>
         <cudf.classifier>cuda11</cudf.classifier>
     </properties>
     <repositories>
diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml
index b1932f3cc..bcb7edb2a 100644
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -44,13 +44,6 @@
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
-        <dependency>
-          <groupId>ai.rapids</groupId>
-          <artifactId>cudf</artifactId>
-          <version>${cudf.version}</version>
-          <classifier>${cudf.classifier}</classifier>
-          <scope>provided</scope>
-        </dependency>
         <dependency>
           <groupId>com.nvidia</groupId>
           <artifactId>rapids-4-spark_${scala.binary.version}</artifactId>

From d237378452b7fbc70ea84762a13e48062e206a07 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Mon, 24 Apr 2023 17:49:08 +0800
Subject: [PATCH 16/34] [jvm-packages] Clean up the dependencies after removing
 scala versioned tracker (#9078)

---
 jvm-packages/pom.xml               |  5 -----
 jvm-packages/xgboost4j-gpu/pom.xml | 12 ------------
 jvm-packages/xgboost4j/pom.xml     | 12 ------------
 3 files changed, 29 deletions(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 1ab44d977..550cac42f 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -477,11 +477,6 @@
             <version>${scala.version}</version>
             <scope>provided</scope>
         </dependency>
-        <dependency>
-            <groupId>org.scala-lang</groupId>
-            <artifactId>scala-reflect</artifactId>
-            <version>${scala.version}</version>
-        </dependency>
         <dependency>
             <groupId>org.scala-lang</groupId>
             <artifactId>scala-library</artifactId>
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index 5ffe0588c..1d7a06708 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -38,18 +38,6 @@
             <version>4.13.2</version>
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-actor_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>compile</scope>
-        </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-testkit_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>test</scope>
-        </dependency>
         <dependency>
             <groupId>org.scalatest</groupId>
             <artifactId>scalatest_${scala.binary.version}</artifactId>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 7c5c33e87..3a1c4b2cf 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -31,18 +31,6 @@
             <version>4.13.2</version>
             <scope>test</scope>
         </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-actor_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>compile</scope>
-        </dependency>
-        <dependency>
-            <groupId>com.typesafe.akka</groupId>
-            <artifactId>akka-testkit_${scala.binary.version}</artifactId>
-            <version>2.6.20</version>
-            <scope>test</scope>
-        </dependency>
         <dependency>
           <groupId>org.scalatest</groupId>
           <artifactId>scalatest_${scala.binary.version}</artifactId>

From 339f21e1bfa1b2d356b6b8318ab7cf9fb4baff36 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Mon, 24 Apr 2023 20:04:14 +0800
Subject: [PATCH 17/34] [pyspark] fix a type hint with old pyspark release
 (#9079)

---
 python-package/xgboost/spark/params.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py
index 78a35eee0..7c3231431 100644
--- a/python-package/xgboost/spark/params.py
+++ b/python-package/xgboost/spark/params.py
@@ -1,4 +1,6 @@
 """Xgboost pyspark integration submodule for params."""
+from typing import Dict
+
 # pylint: disable=too-few-public-methods
 from pyspark.ml.param import TypeConverters
 from pyspark.ml.param.shared import Param, Params
@@ -11,7 +13,7 @@ class HasArbitraryParamsDict(Params):
     input.
     """
 
-    arbitrary_params_dict: Param[dict] = Param(
+    arbitrary_params_dict: "Param[Dict]" = Param(
         Params._dummy(),
         "arbitrary_params_dict",
         "arbitrary_params_dict This parameter holds all of the additional parameters which are "

From a2cc78c1fb4a958721b756c54dc40c761973171d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 25 Apr 2023 21:30:38 +0800
Subject: [PATCH 18/34] Bump scala.version from 2.12.8 to 2.12.17 in
 /jvm-packages (#9083)

Bumps `scala.version` from 2.12.8 to 2.12.17.

Updates `scala-compiler` from 2.12.8 to 2.12.17
- [Release notes](https://github.com/scala/scala/releases)
- [Commits](https://github.com/scala/scala/compare/v2.12.8...v2.12.17)

Updates `scala-library` from 2.12.8 to 2.12.17
- [Release notes](https://github.com/scala/scala/releases)
- [Commits](https://github.com/scala/scala/compare/v2.12.8...v2.12.17)

---
updated-dependencies:
- dependency-name: org.scala-lang:scala-compiler
  dependency-type: direct:production
  update-type: version-update:semver-patch
- dependency-name: org.scala-lang:scala-library
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 550cac42f..b758b42c8 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -35,7 +35,7 @@
         <maven.compiler.target>1.8</maven.compiler.target>
         <flink.version>1.8.3</flink.version>
         <spark.version>3.1.1</spark.version>
-        <scala.version>2.12.8</scala.version>
+        <scala.version>2.12.17</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
         <hadoop.version>3.3.5</hadoop.version>
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>

From 17add4776fabc93de2c305f917cc530466e19db3 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Tue, 25 Apr 2023 23:09:12 +0800
Subject: [PATCH 19/34] [pyspark] Don't stack for non feature columns (#9088)

---
 python-package/xgboost/spark/data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index f2c5e1197..8f84459d7 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -219,7 +219,9 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
                 array: Optional[np.ndarray] = part[feature_cols]
             elif part[name].shape[0] > 0:
                 array = part[name]
-                array = stack_series(array)
+                if name == alias.data:
+                    # For the array/vector typed case.
+                    array = stack_series(array)
             else:
                 array = None
 

From 49ccae7fb9e10b05132d4bf734513fe9267e932e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 26 Apr 2023 01:32:06 +0800
Subject: [PATCH 20/34] Bump spark.version from 3.1.1 to 3.4.0 in /jvm-packages
 (#9039)

Bumps `spark.version` from 3.1.1 to 3.4.0.

Updates `spark-mllib_2.12` from 3.1.1 to 3.4.0

Updates `spark-core_2.12` from 3.1.1 to 3.4.0

Updates `spark-sql_2.12` from 3.1.1 to 3.4.0

---
updated-dependencies:
- dependency-name: org.apache.spark:spark-mllib_2.12
  dependency-type: direct:production
  update-type: version-update:semver-minor
- dependency-name: org.apache.spark:spark-core_2.12
  dependency-type: direct:production
  update-type: version-update:semver-minor
- dependency-name: org.apache.spark:spark-sql_2.12
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 jvm-packages/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index b758b42c8..2aac8b00c 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -34,7 +34,7 @@
         <maven.compiler.source>1.8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
         <flink.version>1.8.3</flink.version>
-        <spark.version>3.1.1</spark.version>
+        <spark.version>3.4.0</spark.version>
         <scala.version>2.12.17</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
         <hadoop.version>3.3.5</hadoop.version>

From a320b402a5f25a69d95cb27062be693cd7edcddf Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Tue, 25 Apr 2023 12:36:09 -0700
Subject: [PATCH 21/34] More refactoring to take advantage of collective
 aggregators (#9081)

---
 include/xgboost/data.h            |  8 ++++
 src/collective/aggregator.h       | 10 ++---
 src/common/hist_util.cc           | 10 ++---
 src/common/quantile.cc            | 26 ++++++------
 src/common/quantile.h             | 16 ++++----
 src/data/data.cc                  |  4 ++
 src/data/iterative_dmatrix.cc     |  4 +-
 src/objective/adaptive.cc         | 66 +++++++++++++++----------------
 src/objective/quantile_obj.cu     |  2 +-
 tests/cpp/common/test_quantile.cc | 16 ++++----
 10 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 4af306859..fe22fb2b5 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -196,6 +196,14 @@ class MetaInfo {
    */
   bool IsVerticalFederated() const;
 
+  /*!
+   * \brief A convenience method to check if the MetaInfo should contain labels.
+   *
+   * Normally we assume labels are available everywhere. The only exception is in vertical federated
+   * learning where labels are only available on worker 0.
+   */
+  bool ShouldHaveLabels() const;
+
  private:
   void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
   void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
index fe7b65930..b33ca28ef 100644
--- a/src/collective/aggregator.h
+++ b/src/collective/aggregator.h
@@ -31,18 +31,16 @@ namespace collective {
  * @param buffer The buffer storing the results.
  * @param size The size of the buffer.
  * @param function The function used to calculate the results.
- * @param args Arguments to the function.
  */
-template <typename Function, typename T, typename... Args>
-void ApplyWithLabels(MetaInfo const& info, T* buffer, size_t size, Function&& function,
-                     Args&&... args) {
+template <typename Function>
+void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) {
   if (info.IsVerticalFederated()) {
     // We assume labels are only available on worker 0, so the calculation is done there and result
     // broadcast to other workers.
     std::string message;
     if (collective::GetRank() == 0) {
       try {
-        std::forward<Function>(function)(std::forward<Args>(args)...);
+        std::forward<Function>(function)();
       } catch (dmlc::Error& e) {
         message = e.what();
       }
@@ -55,7 +53,7 @@ void ApplyWithLabels(MetaInfo const& info, T* buffer, size_t size, Function&& fu
       LOG(FATAL) << &message[0];
     }
   } else {
-    std::forward<Function>(function)(std::forward<Args>(args)...);
+    std::forward<Function>(function)();
   }
 }
 
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index a99ed4f10..f97003d1d 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -45,20 +45,18 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
 
   if (!use_sorted) {
     HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
-                                  HostSketchContainer::UseGroup(info),
-                                  m->Info().IsColumnSplit(), n_threads);
+                                  HostSketchContainer::UseGroup(info), n_threads);
     for (auto const& page : m->GetBatches<SparsePage>()) {
       container.PushRowPage(page, info, hessian);
     }
-    container.MakeCuts(&out);
+    container.MakeCuts(m->Info(), &out);
   } else {
     SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
-                                    HostSketchContainer::UseGroup(info),
-                                    m->Info().IsColumnSplit(), n_threads};
+                                    HostSketchContainer::UseGroup(info), n_threads};
     for (auto const& page : m->GetBatches<SortedCSCPage>()) {
       container.PushColPage(page, info, hessian);
     }
-    container.MakeCuts(&out);
+    container.MakeCuts(m->Info(), &out);
   }
 
   return out;
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index aaf271934..60626052c 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -6,6 +6,7 @@
 #include <limits>
 #include <utility>
 
+#include "../collective/aggregator.h"
 #include "../collective/communicator-inl.h"
 #include "../data/adapter.h"
 #include "categorical.h"
@@ -18,13 +19,12 @@ template <typename WQSketch>
 SketchContainerImpl<WQSketch>::SketchContainerImpl(std::vector<bst_row_t> columns_size,
                                                    int32_t max_bins,
                                                    Span<FeatureType const> feature_types,
-                                                   bool use_group, bool col_split,
+                                                   bool use_group,
                                                    int32_t n_threads)
     : feature_types_(feature_types.cbegin(), feature_types.cend()),
       columns_size_{std::move(columns_size)},
       max_bins_{max_bins},
       use_group_ind_{use_group},
-      col_split_{col_split},
       n_threads_{n_threads} {
   monitor_.Init(__func__);
   CHECK_NE(columns_size_.size(), 0);
@@ -202,10 +202,10 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::AllreduceCategories() {
+void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
   auto world_size = collective::GetWorldSize();
   auto rank = collective::GetRank();
-  if (world_size == 1 || col_split_) {
+  if (world_size == 1 || info.IsColumnSplit()) {
     return;
   }
 
@@ -273,6 +273,7 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories() {
 
 template <typename WQSketch>
 void SketchContainerImpl<WQSketch>::AllReduce(
+    MetaInfo const& info,
     std::vector<typename WQSketch::SummaryContainer> *p_reduced,
     std::vector<int32_t>* p_num_cuts) {
   monitor_.Start(__func__);
@@ -281,7 +282,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
   collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
   CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";
 
-  AllreduceCategories();
+  AllreduceCategories(info);
 
   auto& num_cuts = *p_num_cuts;
   CHECK_EQ(num_cuts.size(), 0);
@@ -292,10 +293,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
 
   // Prune the intermediate num cuts for synchronization.
   std::vector<bst_row_t> global_column_size(columns_size_);
-  if (!col_split_) {
-    collective::Allreduce<collective::Operation::kSum>(global_column_size.data(),
-                                                       global_column_size.size());
-  }
+  collective::GlobalSum(info, &global_column_size);
 
   ParallelFor(sketches_.size(), n_threads_, [&](size_t i) {
     int32_t intermediate_num_cuts = static_cast<int32_t>(
@@ -316,7 +314,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
   });
 
   auto world = collective::GetWorldSize();
-  if (world == 1 || col_split_) {
+  if (world == 1 || info.IsColumnSplit()) {
     monitor_.Stop(__func__);
     return;
   }
@@ -382,11 +380,11 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
+void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const& info, HistogramCuts* cuts) {
   monitor_.Start(__func__);
   std::vector<typename WQSketch::SummaryContainer> reduced;
   std::vector<int32_t> num_cuts;
-  this->AllReduce(&reduced, &num_cuts);
+  this->AllReduce(info, &reduced, &num_cuts);
 
   cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
   std::vector<typename WQSketch::SummaryContainer> final_summaries(reduced.size());
@@ -443,8 +441,8 @@ template class SketchContainerImpl<WXQuantileSketch<float, float>>;
 
 HostSketchContainer::HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
                                          std::vector<size_t> columns_size, bool use_group,
-                                         bool col_split, int32_t n_threads)
-    : SketchContainerImpl{columns_size, max_bins, ft, use_group, col_split, n_threads} {
+                                         int32_t n_threads)
+    : SketchContainerImpl{columns_size, max_bins, ft, use_group, n_threads} {
   monitor_.Init(__func__);
   ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
     auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
diff --git a/src/common/quantile.h b/src/common/quantile.h
index a19b4bbb0..f8d347112 100644
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -789,7 +789,6 @@ class SketchContainerImpl {
   std::vector<bst_row_t> columns_size_;
   int32_t max_bins_;
   bool use_group_ind_{false};
-  bool col_split_;
   int32_t n_threads_;
   bool has_categorical_{false};
   Monitor monitor_;
@@ -802,7 +801,7 @@ class SketchContainerImpl {
    * \param use_group whether is assigned to group to data instance.
    */
   SketchContainerImpl(std::vector<bst_row_t> columns_size, int32_t max_bins,
-                      common::Span<FeatureType const> feature_types, bool use_group, bool col_split,
+                      common::Span<FeatureType const> feature_types, bool use_group,
                       int32_t n_threads);
 
   static bool UseGroup(MetaInfo const &info) {
@@ -829,7 +828,7 @@ class SketchContainerImpl {
                         std::vector<bst_row_t> *p_sketches_scan,
                         std::vector<typename WQSketch::Entry> *p_global_sketches);
   // Merge sketches from all workers.
-  void AllReduce(std::vector<typename WQSketch::SummaryContainer> *p_reduced,
+  void AllReduce(MetaInfo const& info, std::vector<typename WQSketch::SummaryContainer> *p_reduced,
                  std::vector<int32_t> *p_num_cuts);
 
   template <typename Batch, typename IsValid>
@@ -883,11 +882,11 @@ class SketchContainerImpl {
   /* \brief Push a CSR matrix. */
   void PushRowPage(SparsePage const &page, MetaInfo const &info, Span<float const> hessian = {});
 
-  void MakeCuts(HistogramCuts* cuts);
+  void MakeCuts(MetaInfo const& info, HistogramCuts* cuts);
 
  private:
   // Merge all categories from other workers.
-  void AllreduceCategories();
+  void AllreduceCategories(MetaInfo const& info);
 };
 
 class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, float>> {
@@ -896,8 +895,7 @@ class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, fl
 
  public:
   HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                      std::vector<size_t> columns_size, bool use_group, bool col_split,
-                      int32_t n_threads);
+                      std::vector<size_t> columns_size, bool use_group, int32_t n_threads);
 
   template <typename Batch>
   void PushAdapterBatch(Batch const &batch, size_t base_rowid, MetaInfo const &info, float missing);
@@ -993,9 +991,9 @@ class SortedSketchContainer : public SketchContainerImpl<WXQuantileSketch<float,
 
  public:
   explicit SortedSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                                 std::vector<size_t> columns_size, bool use_group, bool col_split,
+                                 std::vector<size_t> columns_size, bool use_group,
                                  int32_t n_threads)
-      : SketchContainerImpl{columns_size, max_bins, ft, use_group, col_split, n_threads} {
+      : SketchContainerImpl{columns_size, max_bins, ft, use_group, n_threads} {
     monitor_.Init(__func__);
     sketches_.resize(columns_size.size());
     size_t i = 0;
diff --git a/src/data/data.cc b/src/data/data.cc
index 694bc48b9..9f85e7db2 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -774,6 +774,10 @@ bool MetaInfo::IsVerticalFederated() const {
   return collective::IsFederated() && IsColumnSplit();
 }
 
+bool MetaInfo::ShouldHaveLabels() const {
+  return !IsVerticalFederated() || collective::GetRank() == 0;
+}
+
 using DMatrixThreadLocal =
     dmlc::ThreadLocalStore<std::map<DMatrix const *, XGBAPIThreadLocalEntry>>;
 
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 1bf755915..3a473122a 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -213,7 +213,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
         SyncFeatureType(&h_ft);
         p_sketch.reset(new common::HostSketchContainer{
             batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
-            proxy->Info().IsColumnSplit(), ctx_.Threads()});
+            ctx_.Threads()});
       }
       HostAdapterDispatch(proxy, [&](auto const& batch) {
         proxy->Info().num_nonzero_ = batch_nnz[i];
@@ -228,7 +228,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
     CHECK_EQ(accumulated_rows, Info().num_row_);
 
     CHECK(p_sketch);
-    p_sketch->MakeCuts(&cuts);
+    p_sketch->MakeCuts(Info(), &cuts);
   }
   if (!h_ft.empty()) {
     CHECK_EQ(h_ft.size(), n_features);
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index 32fda9ef1..b195dffd7 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -99,44 +99,40 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
   auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
                                         predt.Size() / info.num_row_);
 
-  if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
-    // loop over each leaf
-    common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
-      auto nidx = h_node_idx[k];
-      CHECK(tree[nidx].IsLeaf());
-      CHECK_LT(k + 1, h_node_ptr.size());
-      size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
-      auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
+  collective::ApplyWithLabels(
+      info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
+        // loop over each leaf
+        common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
+          auto nidx = h_node_idx[k];
+          CHECK(tree[nidx].IsLeaf());
+          CHECK_LT(k + 1, h_node_ptr.size());
+          size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
+          auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
 
-      auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
-      auto h_weights = linalg::MakeVec(&info.weights_);
+          auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
+          auto h_weights = linalg::MakeVec(&info.weights_);
 
-      auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
-        auto row_idx = h_row_set[i];
-        return h_labels(row_idx) - h_predt(row_idx, group_idx);
+          auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
+            auto row_idx = h_row_set[i];
+            return h_labels(row_idx) - h_predt(row_idx, group_idx);
+          });
+          auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float {
+            auto row_idx = h_row_set[i];
+            return h_weights(row_idx);
+          });
+
+          float q{0};
+          if (info.weights_.Empty()) {
+            q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
+          } else {
+            q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
+          }
+          if (std::isnan(q)) {
+            CHECK(h_row_set.empty());
+          }
+          quantiles.at(k) = q;
+        });
       });
-      auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float {
-        auto row_idx = h_row_set[i];
-        return h_weights(row_idx);
-      });
-
-      float q{0};
-      if (info.weights_.Empty()) {
-        q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
-      } else {
-        q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
-      }
-      if (std::isnan(q)) {
-        CHECK(h_row_set.empty());
-      }
-      quantiles.at(k) = q;
-    });
-  }
-
-  if (info.IsVerticalFederated()) {
-    collective::Broadcast(static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float),
-                          0);
-  }
 
   UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
 }
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index b34f37ff9..f94b5edf0 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -36,7 +36,7 @@ class QuantileRegression : public ObjFunction {
   bst_target_t Targets(MetaInfo const& info) const override {
     auto const& alpha = param_.quantile_alpha.Get();
     CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
-    if (!info.IsVerticalFederated() || collective::GetRank() == 0) {
+    if (info.ShouldHaveLabels()) {
       CHECK_EQ(info.labels.Shape(1), 1)
           << "Multi-target is not yet supported by the quantile loss.";
     }
diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc
index 3cd32ea0c..4771cc9bf 100644
--- a/tests/cpp/common/test_quantile.cc
+++ b/tests/cpp/common/test_quantile.cc
@@ -73,7 +73,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   auto hess = Span<float const>{hessian};
 
   ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                               column_size, false, false, AllThreadsForTest());
+                                               column_size, false, AllThreadsForTest());
 
   if (use_column) {
     for (auto const& page : m->GetBatches<SortedCSCPage>()) {
@@ -86,7 +86,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   }
 
   HistogramCuts distributed_cuts;
-  sketch_distributed.MakeCuts(&distributed_cuts);
+  sketch_distributed.MakeCuts(m->Info(), &distributed_cuts);
 
   // Generate cuts for single node environment
   collective::Finalize();
@@ -94,7 +94,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
   m->Info().num_row_ = world * rows;
   ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                  column_size, false, false, AllThreadsForTest());
+                                                  column_size, false, AllThreadsForTest());
   m->Info().num_row_ = rows;
 
   for (auto rank = 0; rank < world; ++rank) {
@@ -117,7 +117,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   }
 
   HistogramCuts single_node_cuts;
-  sketch_on_single_node.MakeCuts(&single_node_cuts);
+  sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts);
 
   auto const& sptrs = single_node_cuts.Ptrs();
   auto const& dptrs = distributed_cuts.Ptrs();
@@ -205,7 +205,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
   HistogramCuts distributed_cuts;
   {
     ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                 column_size, false, true, AllThreadsForTest());
+                                                 column_size, false, AllThreadsForTest());
 
     std::vector<float> hessian(rows, 1.0);
     auto hess = Span<float const>{hessian};
@@ -219,7 +219,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
       }
     }
 
-    sketch_distributed.MakeCuts(&distributed_cuts);
+    sketch_distributed.MakeCuts(m->Info(), &distributed_cuts);
   }
 
   // Generate cuts for single node environment
@@ -228,7 +228,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
   HistogramCuts single_node_cuts;
   {
     ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                    column_size, false, false, AllThreadsForTest());
+                                                    column_size, false, AllThreadsForTest());
 
     std::vector<float> hessian(rows, 1.0);
     auto hess = Span<float const>{hessian};
@@ -242,7 +242,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
       }
     }
 
-    sketch_on_single_node.MakeCuts(&single_node_cuts);
+    sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts);
   }
 
   auto const& sptrs = single_node_cuts.Ptrs();

From 0e7377ba9c559597056d4a525a04ffd643b3afd7 Mon Sep 17 00:00:00 2001
From: Boris <mail@dotbg.name>
Date: Wed, 26 Apr 2023 12:41:11 +0200
Subject: [PATCH 22/34] Updated flink 1.8 -> 1.17. Added smoke tests for Flink
 (#9046)

---
 .github/workflows/jvm_tests.yml               |   4 +-
 jvm-packages/pom.xml                          |   2 +-
 jvm-packages/xgboost4j-example/pom.xml        |   9 +-
 .../flink/DistTrainWithFlinkExample.java      | 107 ++++++++++
 .../example/flink/DistTrainWithFlink.scala    |  91 +++++++--
 .../flink/DistTrainWithFlinkExampleTest.scala |  36 ++++
 .../flink/DistTrainWithFlinkSuite.scala       |  37 ++++
 jvm-packages/xgboost4j-flink/pom.xml          |  25 +--
 .../ml/dmlc/xgboost4j/java/flink/XGBoost.java | 187 ++++++++++++++++++
 .../xgboost4j/java/flink/XGBoostModel.java    | 136 +++++++++++++
 .../dmlc/xgboost4j/scala/flink/XGBoost.scala  |  99 ----------
 .../xgboost4j/scala/flink/XGBoostModel.scala  |  67 -------
 12 files changed, 591 insertions(+), 209 deletions(-)
 create mode 100644 jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java
 create mode 100644 jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala
 create mode 100644 jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala
 create mode 100644 jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
 create mode 100644 jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java
 delete mode 100644 jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala
 delete mode 100644 jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala

diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 8efcdc2ec..a2d8bb69a 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -40,7 +40,7 @@ jobs:
         key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
         restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
 
-    - name: Test XGBoost4J
+    - name: Test XGBoost4J (Core)
       run: |
         cd jvm-packages
         mvn test -B -pl :xgboost4j_2.12
@@ -67,7 +67,7 @@ jobs:
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
 
 
-    - name: Test XGBoost4J-Spark
+    - name: Test XGBoost4J (Core, Spark, Examples)
       run: |
         rm -rfv build/
         cd jvm-packages
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 2aac8b00c..0ee7f0b1a 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -33,7 +33,7 @@
         <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
         <maven.compiler.source>1.8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
-        <flink.version>1.8.3</flink.version>
+        <flink.version>1.17.0</flink.version>
         <spark.version>3.4.0</spark.version>
         <scala.version>2.12.17</scala.version>
         <scala.binary.version>2.12</scala.binary.version>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index d08e4f409..40c9c72a4 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
+            <version>${project.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
@@ -37,12 +37,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>${project.version}</version>
         </dependency>
     </dependencies>
 </project>
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java
new file mode 100644
index 000000000..94e5cdab5
--- /dev/null
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExample.java
@@ -0,0 +1,107 @@
+/*
+ Copyright (c) 2014-2021 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.java.example.flink;
+
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.flink.api.common.typeinfo.TypeHint;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.api.java.operators.MapOperator;
+import org.apache.flink.api.java.tuple.Tuple13;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.api.java.utils.DataSetUtils;
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.ml.linalg.Vectors;
+
+import ml.dmlc.xgboost4j.java.flink.XGBoost;
+import ml.dmlc.xgboost4j.java.flink.XGBoostModel;
+
+
+public class DistTrainWithFlinkExample {
+
+  static Tuple2<XGBoostModel, DataSet<Float[]>> runPrediction(
+      ExecutionEnvironment env,
+      java.nio.file.Path trainPath,
+      int percentage) throws Exception {
+    // reading data
+    final DataSet<Tuple2<Long, Tuple2<Vector, Double>>> data =
+        DataSetUtils.zipWithIndex(parseCsv(env, trainPath));
+    final long size = data.count();
+    final long trainCount = Math.round(size * 0.01 * percentage);
+    final DataSet<Tuple2<Vector, Double>> trainData =
+        data
+          .filter(item -> item.f0 < trainCount)
+          .map(t -> t.f1)
+          .returns(TypeInformation.of(new TypeHint<Tuple2<Vector, Double>>(){}));
+    final DataSet<Vector> testData =
+        data
+          .filter(tuple -> tuple.f0 >= trainCount)
+          .map(t -> t.f1.f0)
+          .returns(TypeInformation.of(new TypeHint<Vector>(){}));
+
+    // define parameters
+    HashMap<String, Object> paramMap = new HashMap<String, Object>(3);
+    paramMap.put("eta", 0.1);
+    paramMap.put("max_depth", 2);
+    paramMap.put("objective", "binary:logistic");
+
+    // number of iterations
+    final int round = 2;
+    // train the model
+    XGBoostModel model = XGBoost.train(trainData, paramMap, round);
+    DataSet<Float[]> predTest = model.predict(testData);
+    return new Tuple2<XGBoostModel, DataSet<Float[]>>(model, predTest);
+  }
+
+  private static MapOperator<Tuple13<Double, String, Double, Double, Double, Integer, Integer,
+      Integer, Integer, Integer, Integer, Integer, Integer>,
+      Tuple2<Vector, Double>> parseCsv(ExecutionEnvironment env, Path trainPath) {
+    return env.readCsvFile(trainPath.toString())
+      .ignoreFirstLine()
+      .types(Double.class, String.class, Double.class, Double.class, Double.class,
+        Integer.class, Integer.class, Integer.class, Integer.class, Integer.class,
+        Integer.class, Integer.class, Integer.class)
+      .map(DistTrainWithFlinkExample::mapFunction);
+  }
+
+  private static Tuple2<Vector, Double> mapFunction(Tuple13<Double, String, Double, Double, Double,
+      Integer, Integer, Integer, Integer, Integer, Integer, Integer, Integer> tuple) {
+    final DenseVector dense = Vectors.dense(tuple.f2, tuple.f3, tuple.f4, tuple.f5, tuple.f6,
+        tuple.f7, tuple.f8, tuple.f9, tuple.f10, tuple.f11, tuple.f12);
+    if (tuple.f1.contains("inf")) {
+      return new Tuple2<Vector, Double>(dense, 1.0);
+    } else {
+      return new Tuple2<Vector, Double>(dense, 0.0);
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    final java.nio.file.Path parentPath = java.nio.file.Paths.get(Arrays.stream(args)
+        .findFirst().orElse("."));
+    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+    Tuple2<XGBoostModel, DataSet<Float[]>> tuple2 = runPrediction(
+        env, parentPath.resolve("veterans_lung_cancer.csv"), 70
+    );
+    List<Float[]> list = tuple2.f1.collect();
+    System.out.println(list.size());
+  }
+}
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
index 74b24ac35..cb859f62d 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlink.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014 - 2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -15,27 +15,84 @@
  */
 package ml.dmlc.xgboost4j.scala.example.flink
 
-import ml.dmlc.xgboost4j.scala.flink.XGBoost
-import org.apache.flink.api.scala.{ExecutionEnvironment, _}
-import org.apache.flink.ml.MLUtils
+import java.lang.{Double => JDouble, Long => JLong}
+import java.nio.file.{Path, Paths}
+import org.apache.flink.api.java.tuple.{Tuple13, Tuple2}
+import org.apache.flink.api.java.{DataSet, ExecutionEnvironment}
+import org.apache.flink.ml.linalg.{Vector, Vectors}
+import ml.dmlc.xgboost4j.java.flink.{XGBoost, XGBoostModel}
+import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation}
+import org.apache.flink.api.java.utils.DataSetUtils
+
 
 object DistTrainWithFlink {
-  def main(args: Array[String]) {
-    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
-    // read trainining data
-    val trainData =
-      MLUtils.readLibSVM(env, "/path/to/data/agaricus.txt.train")
-    val testData = MLUtils.readLibSVM(env, "/path/to/data/agaricus.txt.test")
-    // define parameters
-    val paramMap = List(
-      "eta" -> 0.1,
-      "max_depth" -> 2,
-      "objective" -> "binary:logistic").toMap
+  import scala.jdk.CollectionConverters._
+  private val rowTypeHint = TypeInformation.of(new TypeHint[Tuple2[Vector, JDouble]]{})
+  private val testDataTypeHint = TypeInformation.of(classOf[Vector])
+
+  private[flink] def parseCsv(trainPath: Path)(implicit env: ExecutionEnvironment):
+      DataSet[Tuple2[JLong, Tuple2[Vector, JDouble]]] = {
+    DataSetUtils.zipWithIndex(
+    env
+      .readCsvFile(trainPath.toString)
+      .ignoreFirstLine
+      .types(
+        classOf[Double], classOf[String], classOf[Double], classOf[Double], classOf[Double],
+        classOf[Integer], classOf[Integer], classOf[Integer], classOf[Integer],
+        classOf[Integer], classOf[Integer], classOf[Integer], classOf[Integer]
+      )
+      .map((row: Tuple13[Double, String, Double, Double, Double,
+        Integer, Integer, Integer, Integer, Integer, Integer, Integer, Integer]) => {
+        val dense = Vectors.dense(row.f2, row.f3, row.f4,
+          row.f5.toDouble, row.f6.toDouble, row.f7.toDouble, row.f8.toDouble,
+          row.f9.toDouble, row.f10.toDouble, row.f11.toDouble, row.f12.toDouble)
+        val label = if (row.f1.contains("inf")) {
+          JDouble.valueOf(1.0)
+        } else {
+          JDouble.valueOf(0.0)
+        }
+        new Tuple2[Vector, JDouble](dense, label)
+      })
+      .returns(rowTypeHint)
+    )
+  }
+
+  private[flink] def runPrediction(trainPath: Path, percentage: Int)
+                                  (implicit env: ExecutionEnvironment):
+    (XGBoostModel, DataSet[Array[Float]]) = {
+    // read training data
+    val data: DataSet[Tuple2[JLong, Tuple2[Vector, JDouble]]] = parseCsv(trainPath)
+    val trainSize = Math.round(0.01 * percentage * data.count())
+    val trainData: DataSet[Tuple2[Vector, JDouble]] =
+      data.filter(d => d.f0 < trainSize).map(_.f1).returns(rowTypeHint)
+
+
+    val testData: DataSet[Vector] =
+        data
+          .filter(d => d.f0 >= trainSize)
+          .map(_.f1.f0)
+          .returns(testDataTypeHint)
+
+    val paramMap = mapAsJavaMap(Map(
+      ("eta", "0.1".asInstanceOf[AnyRef]),
+      ("max_depth", "2"),
+      ("objective", "binary:logistic"),
+      ("verbosity", "1")
+    ))
+
     // number of iterations
     val round = 2
     // train the model
     val model = XGBoost.train(trainData, paramMap, round)
-    val predTest = model.predict(testData.map{x => x.vector})
-    model.saveModelAsHadoopFile("file:///path/to/xgboost.model")
+    val result = model.predict(testData).map(prediction => prediction.map(Float.unbox))
+    (model, result)
+  }
+
+  def main(args: Array[String]): Unit = {
+    implicit val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
+    val parentPath = Paths.get(args.headOption.getOrElse("."))
+    val (_, predTest) = runPrediction(parentPath.resolve("veterans_lung_cancer.csv"), 70)
+    val list = predTest.collect().asScala
+    println(list.length)
   }
 }
diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala
new file mode 100644
index 000000000..b9929639f
--- /dev/null
+++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/java/example/flink/DistTrainWithFlinkExampleTest.scala
@@ -0,0 +1,36 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.java.example.flink
+
+import org.apache.flink.api.java.ExecutionEnvironment
+import org.scalatest.Inspectors._
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers._
+
+import java.nio.file.Paths
+
+class DistTrainWithFlinkExampleTest extends AnyFunSuite {
+  private val parentPath = Paths.get("../../").resolve("demo").resolve("data")
+  private val data = parentPath.resolve("veterans_lung_cancer.csv")
+
+  test("Smoke test for scala flink example") {
+    val env = ExecutionEnvironment.createLocalEnvironment(1)
+    val tuple2 = DistTrainWithFlinkExample.runPrediction(env, data, 70)
+    val results = tuple2.f1.collect()
+    results should have size 41
+    forEvery(results)(item => item should have size 1)
+  }
+}
diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala
new file mode 100644
index 000000000..d9e98d81c
--- /dev/null
+++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/flink/DistTrainWithFlinkSuite.scala
@@ -0,0 +1,37 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+package ml.dmlc.xgboost4j.scala.example.flink
+
+import org.apache.flink.api.java.ExecutionEnvironment
+import org.scalatest.Inspectors._
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers._
+
+import java.nio.file.Paths
+import scala.jdk.CollectionConverters._
+
+class DistTrainWithFlinkSuite extends AnyFunSuite {
+  private val parentPath = Paths.get("../../").resolve("demo").resolve("data")
+  private val data = parentPath.resolve("veterans_lung_cancer.csv")
+
+  test("Smoke test for scala flink example") {
+    implicit val env: ExecutionEnvironment = ExecutionEnvironment.createLocalEnvironment(1)
+    val (_, result) = DistTrainWithFlink.runPrediction(data, 70)
+    val results = result.collect().asScala
+    results should have size 41
+    forEvery(results)(item => item should have size 1)
+  }
+}
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index b8b757eae..a9a80e29a 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -8,8 +8,11 @@
         <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.0.0-SNAPSHOT</version>
     </parent>
-    <artifactId>xgboost4j-flink_2.12</artifactId>
+    <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
     <version>2.0.0-SNAPSHOT</version>
+    <properties>
+      <flink-ml.version>2.2.0</flink-ml.version>
+    </properties>
     <build>
         <plugins>
             <plugin>
@@ -26,32 +29,22 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-            <version>2.0.0-SNAPSHOT</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>${project.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.flink</groupId>
-            <artifactId>flink-scala_${scala.binary.version}</artifactId>
+            <artifactId>flink-clients</artifactId>
             <version>${flink.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.flink</groupId>
-            <artifactId>flink-clients_${scala.binary.version}</artifactId>
-            <version>${flink.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.flink</groupId>
-            <artifactId>flink-ml_${scala.binary.version}</artifactId>
-            <version>${flink.version}</version>
+            <artifactId>flink-ml-servable-core</artifactId>
+            <version>${flink-ml.version}</version>
         </dependency>
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-common</artifactId>
-            <version>3.3.5</version>
+            <version>${hadoop.version}</version>
         </dependency>
     </dependencies>
 
diff --git a/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
new file mode 100644
index 000000000..7a5e3ac68
--- /dev/null
+++ b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoost.java
@@ -0,0 +1,187 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.java.flink;
+
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.apache.flink.api.common.functions.RichMapPartitionFunction;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.ml.linalg.SparseVector;
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.util.Collector;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import ml.dmlc.xgboost4j.LabeledPoint;
+import ml.dmlc.xgboost4j.java.Booster;
+import ml.dmlc.xgboost4j.java.Communicator;
+import ml.dmlc.xgboost4j.java.DMatrix;
+import ml.dmlc.xgboost4j.java.RabitTracker;
+import ml.dmlc.xgboost4j.java.XGBoostError;
+
+
+public class XGBoost {
+  private static final Logger logger = LoggerFactory.getLogger(XGBoost.class);
+
+  private static class MapFunction
+      extends RichMapPartitionFunction<Tuple2<Vector, Double>, XGBoostModel> {
+
+    private final Map<String, Object> params;
+    private final int round;
+    private final Map<String, String> workerEnvs;
+
+    public MapFunction(Map<String, Object> params, int round, Map<String, String> workerEnvs) {
+      this.params = params;
+      this.round = round;
+      this.workerEnvs = workerEnvs;
+    }
+
+    public void mapPartition(java.lang.Iterable<Tuple2<Vector, Double>> it,
+                             Collector<XGBoostModel> collector) throws XGBoostError {
+      workerEnvs.put(
+          "DMLC_TASK_ID",
+          String.valueOf(this.getRuntimeContext().getIndexOfThisSubtask())
+      );
+
+      if (logger.isInfoEnabled()) {
+        logger.info("start with env: {}", workerEnvs.entrySet().stream()
+            .map(e -> String.format("\"%s\": \"%s\"", e.getKey(), e.getValue()))
+            .collect(Collectors.joining(", "))
+        );
+      }
+
+      final Iterator<LabeledPoint> dataIter =
+          StreamSupport
+            .stream(it.spliterator(), false)
+            .map(VectorToPointMapper.INSTANCE)
+            .iterator();
+
+      if (dataIter.hasNext()) {
+        final DMatrix trainMat = new DMatrix(dataIter, null);
+        int numEarlyStoppingRounds =
+            Optional.ofNullable(params.get("numEarlyStoppingRounds"))
+              .map(x -> Integer.parseInt(x.toString()))
+              .orElse(0);
+
+        final Booster booster = trainBooster(trainMat, numEarlyStoppingRounds);
+        collector.collect(new XGBoostModel(booster));
+      } else {
+        logger.warn("Nothing to train with.");
+      }
+    }
+
+    private Booster trainBooster(DMatrix trainMat,
+                                 int numEarlyStoppingRounds) throws XGBoostError {
+      Booster booster;
+      final Map<String, DMatrix> watches =
+          new HashMap<String, DMatrix>() {{ put("train", trainMat); }};
+      try {
+        Communicator.init(workerEnvs);
+        booster = ml.dmlc.xgboost4j.java.XGBoost
+          .train(
+            trainMat,
+            params,
+            round,
+            watches,
+            null,
+            null,
+            null,
+            numEarlyStoppingRounds);
+      } catch (XGBoostError xgbException) {
+        final String identifier = String.valueOf(this.getRuntimeContext().getIndexOfThisSubtask());
+        logger.warn(
+            String.format("XGBooster worker %s has failed due to", identifier),
+            xgbException
+        );
+        throw xgbException;
+      } finally {
+        Communicator.shutdown();
+      }
+      return booster;
+    }
+
+    private static class VectorToPointMapper
+        implements Function<Tuple2<Vector, Double>, LabeledPoint> {
+      public static VectorToPointMapper INSTANCE = new VectorToPointMapper();
+      @Override
+      public LabeledPoint apply(Tuple2<Vector, Double> tuple) {
+        final SparseVector vector = tuple.f0.toSparse();
+        final double[] values = vector.values;
+        final int size = values.length;
+        final float[] array = new float[size];
+        for (int i = 0; i < size; i++) {
+          array[i] = (float) values[i];
+        }
+        return new LabeledPoint(
+          tuple.f1.floatValue(),
+          vector.size(),
+          vector.indices,
+          array);
+      }
+    }
+  }
+
+  /**
+   * Load XGBoost model from path, using Hadoop Filesystem API.
+   *
+   * @param modelPath The path that is accessible by hadoop filesystem API.
+   * @return The loaded model
+   */
+  public static XGBoostModel loadModelFromHadoopFile(final String modelPath) throws Exception {
+    final FileSystem fileSystem = FileSystem.get(new Configuration());
+    final Path f = new Path(modelPath);
+
+    try (FSDataInputStream opened = fileSystem.open(f)) {
+      return new XGBoostModel(ml.dmlc.xgboost4j.java.XGBoost.loadModel(opened));
+    }
+  }
+
+  /**
+   * Train an XGBoost model with Flink.
+   *
+   * @param dtrain The training data.
+   * @param params XGBoost parameters.
+   * @param numBoostRound  Number of rounds to train.
+   */
+  public static XGBoostModel train(DataSet<Tuple2<Vector, Double>> dtrain,
+                                   Map<String, Object> params,
+                                   int numBoostRound) throws Exception {
+    final RabitTracker tracker =
+        new RabitTracker(dtrain.getExecutionEnvironment().getParallelism());
+    if (tracker.start(0L)) {
+      return dtrain
+        .mapPartition(new MapFunction(params, numBoostRound, tracker.getWorkerEnvs()))
+        .reduce((x, y) -> x)
+        .collect()
+        .get(0);
+    } else {
+      throw new Error("Tracker cannot be started");
+    }
+  }
+}
diff --git a/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java
new file mode 100644
index 000000000..03de50482
--- /dev/null
+++ b/jvm-packages/xgboost4j-flink/src/main/java/ml/dmlc/xgboost4j/java/flink/XGBoostModel.java
@@ -0,0 +1,136 @@
+/*
+ Copyright (c) 2014-2023 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.java.flink;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.flink.api.common.functions.MapPartitionFunction;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.ml.linalg.SparseVector;
+import org.apache.flink.ml.linalg.Vector;
+import org.apache.flink.util.Collector;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import ml.dmlc.xgboost4j.LabeledPoint;
+import ml.dmlc.xgboost4j.java.Booster;
+import ml.dmlc.xgboost4j.java.DMatrix;
+import ml.dmlc.xgboost4j.java.XGBoostError;
+
+
+public class XGBoostModel implements Serializable {
+  private static final org.slf4j.Logger logger =
+      org.slf4j.LoggerFactory.getLogger(XGBoostModel.class);
+
+  private final Booster booster;
+  private final PredictorFunction predictorFunction;
+
+
+  public XGBoostModel(Booster booster) {
+    this.booster = booster;
+    this.predictorFunction = new PredictorFunction(booster);
+  }
+
+  /**
+   * Save the model as a Hadoop filesystem file.
+   *
+   * @param modelPath The model path as in Hadoop path.
+   */
+  public void saveModelAsHadoopFile(String modelPath) throws IOException, XGBoostError {
+    booster.saveModel(FileSystem.get(new Configuration()).create(new Path(modelPath)));
+  }
+
+  public byte[] toByteArray(String format) throws XGBoostError {
+    return booster.toByteArray(format);
+  }
+
+  /**
+   * Save the model as a Hadoop filesystem file, serialized in the given format.
+   *
+   * @param modelPath The model path as in Hadoop path.
+   * @param format The model format (ubj, json, deprecated)
+   * @throws XGBoostError internal error
+   * @throws IOException save error
+   */
+  public void saveModelAsHadoopFile(String modelPath, String format)
+      throws IOException, XGBoostError {
+    booster.saveModel(FileSystem.get(new Configuration()).create(new Path(modelPath)), format);
+  }
+
+  /**
+   * Predict with the given DMatrix.
+   *
+   * @param testSet the local test set represented as DMatrix
+   * @return prediction result
+   */
+  public float[][] predict(DMatrix testSet) throws XGBoostError {
+    return booster.predict(testSet, true, 0);
+  }
+
+  /**
+   * Predict given vector dataset.
+   *
+   * @param data The dataset to be predicted.
+   * @return The prediction result.
+   */
+  public DataSet<Float[]> predict(DataSet<Vector> data) {
+    return data.mapPartition(predictorFunction);
+  }
+
+
+  private static class PredictorFunction implements MapPartitionFunction<Vector, Float[]> {
+
+    private final Booster booster;
+
+    public PredictorFunction(Booster booster) {
+      this.booster = booster;
+    }
+
+    @Override
+    public void mapPartition(Iterable<Vector> it, Collector<Float[]> out) throws Exception {
+      final Iterator<LabeledPoint> dataIter =
+          StreamSupport.stream(it.spliterator(), false)
+            .map(Vector::toSparse)
+            .map(PredictorFunction::fromVector)
+            .iterator();
+
+      if (dataIter.hasNext()) {
+        final DMatrix data = new DMatrix(dataIter, null);
+        float[][] predictions = booster.predict(data, true, 2);
+        Arrays.stream(predictions).map(ArrayUtils::toObject).forEach(out::collect);
+      } else {
+        logger.debug("Empty partition");
+      }
+    }
+
+    private static LabeledPoint fromVector(SparseVector vector) {
+      final int[] index = vector.indices;
+      final double[] value = vector.values;
+      int size = value.length;
+      final float[] values = new float[size];
+      for (int i = 0; i < size; i++) {
+        values[i] = (float) value[i];
+      }
+      return new LabeledPoint(0.0f, vector.size(), index, values);
+    }
+  }
+}
diff --git a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala b/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala
deleted file mode 100644
index 6878f1865..000000000
--- a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoost.scala
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.flink
-
-import scala.collection.JavaConverters.asScalaIteratorConverter
-
-import ml.dmlc.xgboost4j.LabeledPoint
-import ml.dmlc.xgboost4j.java.{Communicator, RabitTracker}
-import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => XGBoostScala}
-
-import org.apache.commons.logging.LogFactory
-import org.apache.flink.api.common.functions.RichMapPartitionFunction
-import org.apache.flink.api.scala.{DataSet, _}
-import org.apache.flink.ml.common.LabeledVector
-import org.apache.flink.util.Collector
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-
-object XGBoost {
-  /**
-    * Helper map function to start the job.
-    *
-    * @param workerEnvs
-    */
-  private class MapFunction(paramMap: Map[String, Any],
-                            round: Int,
-                            workerEnvs: java.util.Map[String, String])
-    extends RichMapPartitionFunction[LabeledVector, XGBoostModel] {
-    val logger = LogFactory.getLog(this.getClass)
-
-    def mapPartition(it: java.lang.Iterable[LabeledVector],
-                     collector: Collector[XGBoostModel]): Unit = {
-      workerEnvs.put("DMLC_TASK_ID", String.valueOf(this.getRuntimeContext.getIndexOfThisSubtask))
-      logger.info("start with env" + workerEnvs.toString)
-      Communicator.init(workerEnvs)
-      val mapper = (x: LabeledVector) => {
-        val (index, value) = x.vector.toSeq.unzip
-        LabeledPoint(x.label.toFloat, x.vector.size, index.toArray, value.map(_.toFloat).toArray)
-      }
-      val dataIter = for (x <- it.iterator().asScala) yield mapper(x)
-      val trainMat = new DMatrix(dataIter, null)
-      val watches = List("train" -> trainMat).toMap
-      val round = 2
-      val numEarlyStoppingRounds = paramMap.get("numEarlyStoppingRounds")
-          .map(_.toString.toInt).getOrElse(0)
-      val booster = XGBoostScala.train(trainMat, paramMap, round, watches,
-        earlyStoppingRound = numEarlyStoppingRounds)
-      Communicator.shutdown()
-      collector.collect(new XGBoostModel(booster))
-    }
-  }
-
-  val logger = LogFactory.getLog(this.getClass)
-
-  /**
-    * Load XGBoost model from path, using Hadoop Filesystem API.
-    *
-    * @param modelPath The path that is accessible by hadoop filesystem API.
-    * @return The loaded model
-    */
-  def loadModelFromHadoopFile(modelPath: String) : XGBoostModel = {
-    new XGBoostModel(
-      XGBoostScala.loadModel(FileSystem.get(new Configuration).open(new Path(modelPath))))
-  }
-
-  /**
-    * Train a xgboost model with link.
-    *
-    * @param dtrain The training data.
-    * @param params The parameters to XGBoost.
-    * @param round Number of rounds to train.
-    */
-  def train(dtrain: DataSet[LabeledVector], params: Map[String, Any], round: Int):
-      XGBoostModel = {
-    val tracker = new RabitTracker(dtrain.getExecutionEnvironment.getParallelism)
-    if (tracker.start(0L)) {
-      dtrain
-        .mapPartition(new MapFunction(params, round, tracker.getWorkerEnvs))
-        .reduce((x, y) => x).collect().head
-    } else {
-      throw new Error("Tracker cannot be started")
-      null
-    }
-  }
-}
diff --git a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala b/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala
deleted file mode 100644
index 71b376974..000000000
--- a/jvm-packages/xgboost4j-flink/src/main/scala/ml/dmlc/xgboost4j/scala/flink/XGBoostModel.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- Copyright (c) 2014 by Contributors
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-package ml.dmlc.xgboost4j.scala.flink
-
-import ml.dmlc.xgboost4j.LabeledPoint
-import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
-
-import org.apache.flink.api.scala.{DataSet, _}
-import org.apache.flink.ml.math.Vector
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-
-class XGBoostModel (booster: Booster) extends Serializable {
-  /**
-    * Save the model as a Hadoop filesystem file.
-    *
-    * @param modelPath The model path as in Hadoop path.
-    */
-  def saveModelAsHadoopFile(modelPath: String): Unit = {
-    booster.saveModel(FileSystem
-      .get(new Configuration)
-      .create(new Path(modelPath)))
-  }
-
-  /**
-   * predict with the given DMatrix
-   * @param testSet the local test set represented as DMatrix
-   * @return prediction result
-   */
-  def predict(testSet: DMatrix): Array[Array[Float]] = {
-    booster.predict(testSet, true, 0)
-  }
-
-  /**
-    * Predict given vector dataset.
-    *
-    * @param data The dataset to be predicted.
-    * @return The prediction result.
-    */
-  def predict(data: DataSet[Vector]) : DataSet[Array[Float]] = {
-    val predictMap: Iterator[Vector] => Traversable[Array[Float]] =
-      (it: Iterator[Vector]) => {
-        val mapper = (x: Vector) => {
-          val (index, value) = x.toSeq.unzip
-          LabeledPoint(0.0f, x.size, index.toArray, value.map(_.toFloat).toArray)
-        }
-        val dataIter = for (x <- it) yield mapper(x)
-        val dmat = new DMatrix(dataIter, null)
-        this.booster.predict(dmat)
-      }
-    data.mapPartition(predictMap)
-  }
-}

From 353ed5339db8d60c146b27211e923369d5ce2511 Mon Sep 17 00:00:00 2001
From: Scott Gustafson <s1gustaf@gmail.com>
Date: Wed, 26 Apr 2023 14:23:35 -0400
Subject: [PATCH 23/34] Convert ``DaskXGBClassifier.classes_`` to an array
 (#8452)

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/dask.py                |  8 +++++++-
 .../test_with_dask/test_with_dask.py          | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py
index 88bd1c819..35c5c009f 100644
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -73,6 +73,7 @@ from .core import (
     _deprecate_positional_args,
     _expect,
 )
+from .data import _is_cudf_ser, _is_cupy_array
 from .sklearn import (
     XGBClassifier,
     XGBClassifierBase,
@@ -1894,10 +1895,15 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
         )
 
         # pylint: disable=attribute-defined-outside-init
-        if isinstance(y, (da.Array)):
+        if isinstance(y, da.Array):
             self.classes_ = await self.client.compute(da.unique(y))
         else:
             self.classes_ = await self.client.compute(y.drop_duplicates())
+        if _is_cudf_ser(self.classes_):
+            self.classes_ = self.classes_.to_cupy()
+        if _is_cupy_array(self.classes_):
+            self.classes_ = self.classes_.get()
+        self.classes_ = numpy.array(self.classes_)
         self.n_classes_ = len(self.classes_)
 
         if self.n_classes_ > 2:
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index 0bf952025..5e9303a46 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -192,6 +192,25 @@ def deterministic_repartition(
     return X, y, m
 
 
+@pytest.mark.parametrize("to_frame", [True, False])
+def test_xgbclassifier_classes_type_and_value(to_frame: bool, client: "Client"):
+    X, y = make_classification(n_samples=1000, n_features=4, random_state=123)
+    if to_frame:
+        import pandas as pd
+        feats = [f"var_{i}" for i in range(4)]
+        df = pd.DataFrame(X, columns=feats)
+        df["target"] = y
+        df = dd.from_pandas(df, npartitions=1)
+        X, y = df[feats], df["target"]
+    else:
+        X = da.from_array(X)
+        y = da.from_array(y)
+
+    est = xgb.dask.DaskXGBClassifier(n_estimators=10).fit(X, y)
+    assert isinstance(est.classes_, np.ndarray)
+    np.testing.assert_array_equal(est.classes_, np.array([0, 1]))
+
+
 def test_from_dask_dataframe() -> None:
     with LocalCluster(n_workers=kWorkers, dashboard_address=":0") as cluster:
         with Client(cluster) as client:

From 101a2e643df01342b1ebfffa69414ae6112d6b91 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 27 Apr 2023 09:46:46 +0800
Subject: [PATCH 24/34] [jvm-packages] Bump rapids version. (#9097)

---
 jvm-packages/pom.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 0ee7f0b1a..4903b8f38 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -41,8 +41,8 @@
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
-        <cudf.version>23.02.0</cudf.version>
-        <spark.rapids.version>23.02.0</spark.rapids.version>
+        <cudf.version>23.04.0</cudf.version>
+        <spark.rapids.version>23.04.0</spark.rapids.version>
         <cudf.classifier>cuda11</cudf.classifier>
     </properties>
     <repositories>

From 511d4996b56424b51e9fef4ca6e7e71f48ca237b Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 26 Apr 2023 18:48:26 -0700
Subject: [PATCH 25/34] Rely on gRPC to generate random port (#9102)

---
 tests/cpp/plugin/helpers.h                    | 33 +++++++------------
 .../cpp/plugin/test_federated_communicator.cc | 31 +++++------------
 2 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 41e5a63e5..0dbdeeca4 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -13,25 +13,6 @@
 #include "../../../plugin/federated/federated_server.h"
 #include "../../../src/collective/communicator-inl.h"
 
-inline int GenerateRandomPort(int low, int high) {
-  using namespace std::chrono_literals;
-  // Ensure unique timestamp by introducing a small artificial delay
-  std::this_thread::sleep_for(100ms);
-  auto timestamp = static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::milliseconds>(
-                                             std::chrono::system_clock::now().time_since_epoch())
-                                             .count());
-  std::mt19937_64 rng(timestamp);
-  std::uniform_int_distribution<int> dist(low, high);
-  int port = dist(rng);
-  return port;
-}
-
-inline std::string GetServerAddress() {
-  int port = GenerateRandomPort(50000, 60000);
-  std::string address = std::string("localhost:") + std::to_string(port);
-  return address;
-}
-
 namespace xgboost {
 
 class ServerForTest {
@@ -41,13 +22,14 @@ class ServerForTest {
 
  public:
   explicit ServerForTest(std::int32_t world_size) {
-    server_address_ = GetServerAddress();
     server_thread_.reset(new std::thread([this, world_size] {
       grpc::ServerBuilder builder;
       xgboost::federated::FederatedService service{world_size};
-      builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
+      int selected_port;
+      builder.AddListeningPort("localhost:0", grpc::InsecureServerCredentials(), &selected_port);
       builder.RegisterService(&service);
       server_ = builder.BuildAndStart();
+      server_address_ = std::string("localhost:") + std::to_string(selected_port);
       server_->Wait();
     }));
   }
@@ -56,7 +38,14 @@ class ServerForTest {
     server_->Shutdown();
     server_thread_->join();
   }
-  auto Address() const { return server_address_; }
+
+  auto Address() const {
+    using namespace std::chrono_literals;
+    while (server_address_.empty()) {
+      std::this_thread::sleep_for(100ms);
+    }
+    return server_address_;
+  }
 };
 
 class BaseFederatedTest : public ::testing::Test {
diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc
index 340849606..62f33d5ee 100644
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -62,34 +62,24 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
 };
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
-    FederatedCommunicator comm{0, 0, server_address, "", "", ""};
-  };
+  auto construct = [] { FederatedCommunicator comm{0, 0, "localhost:0", "", "", ""}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooSmall) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
-    FederatedCommunicator comm{1, -1, server_address, "", "", ""};
-  };
+  auto construct = [] { FederatedCommunicator comm{1, -1, "localhost:0", "", "", ""}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooBig) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
-    FederatedCommunicator comm{1, 1, server_address, "", "", ""};
-  };
+  auto construct = [] { FederatedCommunicator comm{1, 1, "localhost:0", "", "", ""}; };
   EXPECT_THROW(construct(), dmlc::Error);
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
+  auto construct = [] {
     Json config{JsonObject()};
-    config["federated_server_address"] = server_address;
+    config["federated_server_address"] = std::string("localhost:0");
     config["federated_world_size"] = std::string("1");
     config["federated_rank"] = Integer(0);
     FederatedCommunicator::Create(config);
@@ -98,10 +88,9 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
 }
 
 TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
-  std::string server_address{GetServerAddress()};
-  auto construct = [server_address]() {
+  auto construct = [] {
     Json config{JsonObject()};
-    config["federated_server_address"] = server_address;
+    config["federated_server_address"] = std::string("localhost:0");
     config["federated_world_size"] = 1;
     config["federated_rank"] = std::string("0");
     FederatedCommunicator::Create(config);
@@ -110,15 +99,13 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
 }
 
 TEST(FederatedCommunicatorSimpleTest, GetWorldSizeAndRank) {
-  std::string server_address{GetServerAddress()};
-  FederatedCommunicator comm{6, 3, server_address};
+  FederatedCommunicator comm{6, 3, "localhost:0"};
   EXPECT_EQ(comm.GetWorldSize(), 6);
   EXPECT_EQ(comm.GetRank(), 3);
 }
 
 TEST(FederatedCommunicatorSimpleTest, IsDistributed) {
-  std::string server_address{GetServerAddress()};
-  FederatedCommunicator comm{2, 1, server_address};
+  FederatedCommunicator comm{2, 1, "localhost:0"};
   EXPECT_TRUE(comm.IsDistributed());
 }
 

From 96d3f8a6f3db27f6745360f781d367df446fe8db Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 27 Apr 2023 19:29:03 +0800
Subject: [PATCH 26/34] [doc] Update document. (#9098)

- Mention flink is still under construction.
- Update doxygen version.
- Fix warnings from doxygen about defgroup title and mismatched parameter name.
---
 doc/Doxyfile.in         | 754 +++++++++++++++++++++++++++-------------
 doc/jvm/index.rst       |   4 +
 doc/tutorials/dask.rst  |   2 +-
 include/xgboost/c_api.h |  18 +-
 4 files changed, 517 insertions(+), 261 deletions(-)

diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index b159ef172..e24d67282 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.8
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "xgboost"
+PROJECT_NAME           = xgboost
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -46,10 +46,10 @@ PROJECT_NUMBER         = @XGBOOST_VERSION@
 
 PROJECT_BRIEF          =
 
-# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
-# the documentation. The maximum height of the logo should not exceed 55 pixels
-# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
-# to the output directory.
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
 
 PROJECT_LOGO           =
 
@@ -60,7 +60,7 @@ PROJECT_LOGO           =
 
 OUTPUT_DIRECTORY       = @PROJECT_BINARY_DIR@/doc_doxygen
 
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
 # will distribute the generated files over these directories. Enabling this
 # option can be useful when feeding doxygen a huge amount of source files, where
@@ -76,7 +76,7 @@ CREATE_SUBDIRS         = NO
 # U+3044.
 # The default value is: NO.
 
-#ALLOW_UNICODE_NAMES    = NO
+ALLOW_UNICODE_NAMES    = NO
 
 # The OUTPUT_LANGUAGE tag is used to specify the language in which all
 # documentation generated by doxygen is written. Doxygen will use this
@@ -93,14 +93,22 @@ CREATE_SUBDIRS         = NO
 
 OUTPUT_LANGUAGE        = English
 
-# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
 # The default value is: YES.
 
 BRIEF_MEMBER_DESC      = YES
 
-# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
 # description of a member or function before the detailed description
 #
 # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
@@ -135,7 +143,7 @@ ALWAYS_DETAILED_SEC    = NO
 
 INLINE_INHERITED_MEMB  = NO
 
-# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
 # before files name in the file list and in the header files. If set to NO the
 # shortest path that makes the file name unique will be used
 # The default value is: YES.
@@ -179,6 +187,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -199,15 +217,23 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
 
 INHERIT_DOCS           = YES
 
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
-# new page for each member. If set to NO, the documentation of a member will be
-# part of the file/class/namespace that contains it.
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
 # The default value is: NO.
 
 SEPARATE_MEMBER_PAGES  = NO
@@ -226,16 +252,15 @@ TAB_SIZE               = 8
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\). This can lead to conflicts with the
+# commands \{ and \}; for these it is advised to use the versions @{ and @} or
+# use a double escape (\\{ and \\}).
 
 ALIASES                =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -264,42 +289,63 @@ OPTIMIZE_FOR_FORTRAN   = NO
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
-# Note For files without extension you can use no_extension as a placeholder.
+# Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note: see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
 # The default value is: YES.
 
-#MARKDOWN_SUPPORT       = YES
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
 
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by by putting a % sign in front of the word
-# or globally by setting AUTOLINK_SUPPORT to NO.
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
 # The default value is: YES.
 
-#AUTOLINK_SUPPORT       = YES
+AUTOLINK_SUPPORT       = YES
 
 # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
 # to include (a tag file for) the STL sources as input, then you should set this
@@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT    = NO
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -336,13 +382,20 @@ SIP_SUPPORT            = NO
 IDL_PROPERTY_SUPPORT   = YES
 
 # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
+# tag is set to YES then doxygen will reuse the documentation of the first
 # member in the group (if any) for the other members of the group. By default
 # all members of a group must be documented explicitly.
 # The default value is: NO.
 
 DISTRIBUTE_GROUP_DOC   = NO
 
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
 # Set the SUBGROUPING tag to YES to allow class member groups of the same type
 # (for instance a group of public functions) to be put as a subgroup of that
 # type (e.g. under the Public Functions section). Set it to NO to prevent
@@ -397,11 +450,24 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
 
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
 # documentation are documented, even if no documentation was available. Private
 # class members and static file members will be hidden unless the
 # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
@@ -411,35 +477,41 @@ LOOKUP_CACHE_SIZE      = 0
 
 EXTRACT_ALL            = YES
 
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
 # be included in the documentation.
 # The default value is: NO.
 
 EXTRACT_PRIVATE        = NO
 
-# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
 
-#EXTRACT_PACKAGE        = NO
+EXTRACT_PACKAGE        = NO
 
-# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
 # included in the documentation.
 # The default value is: NO.
 
 EXTRACT_STATIC         = NO
 
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
 # only classes defined in header files are included. Does not have any effect
 # for Java sources.
 # The default value is: YES.
 
 EXTRACT_LOCAL_CLASSES  = YES
 
-# This flag is only useful for Objective-C code. When set to YES local methods,
+# This flag is only useful for Objective-C code. If set to YES, local methods,
 # which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO only methods in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
 # included.
 # The default value is: NO.
 
@@ -454,6 +526,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -464,21 +543,21 @@ HIDE_UNDOC_MEMBERS     = NO
 
 # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
 # undocumented classes that are normally visible in the class hierarchy. If set
-# to NO these classes will be included in the various overviews. This option has
-# no effect if EXTRACT_ALL is enabled.
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
 # The default value is: NO.
 
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
 
 # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO these
+# documentation blocks found inside the body of a function. If set to NO, these
 # blocks will be appended to the function's detailed documentation block.
 # The default value is: NO.
 
@@ -491,22 +570,36 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = YES
 
 # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES the
+# their full class and namespace scopes in the documentation. If set to YES, the
 # scope will be hidden.
 # The default value is: NO.
 
 HIDE_SCOPE_NAMES       = NO
 
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
 # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
 # the files that are included by a file in the documentation of that file.
 # The default value is: YES.
@@ -518,7 +611,7 @@ SHOW_INCLUDE_FILES     = YES
 # which file to include in order to use the member.
 # The default value is: NO.
 
-#SHOW_GROUPED_MEMB_INC  = NO
+SHOW_GROUPED_MEMB_INC  = NO
 
 # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
 # files with double quotes in the documentation rather than with sharp brackets.
@@ -534,14 +627,14 @@ INLINE_INFO            = YES
 
 # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
 # (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order.
+# name. If set to NO, the members will appear in declaration order.
 # The default value is: YES.
 
 SORT_MEMBER_DOCS       = YES
 
 # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
 # descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order. Note that
+# name. If set to NO, the members will appear in declaration order. Note that
 # this will also influence the order of the classes in the class list.
 # The default value is: NO.
 
@@ -586,27 +679,25 @@ SORT_BY_SCOPE_NAME     = NO
 
 STRICT_PROTO_MATCHING  = NO
 
-# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
-# todo list. This list is created by putting \todo commands in the
-# documentation.
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
 # The default value is: YES.
 
 GENERATE_TODOLIST      = YES
 
-# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
-# test list. This list is created by putting \test commands in the
-# documentation.
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
 # The default value is: YES.
 
 GENERATE_TESTLIST      = YES
 
-# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
 # list. This list is created by putting \bug commands in the documentation.
 # The default value is: YES.
 
 GENERATE_BUGLIST       = YES
 
-# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
 # the deprecated list. This list is created by putting \deprecated commands in
 # the documentation.
 # The default value is: YES.
@@ -631,8 +722,8 @@ ENABLED_SECTIONS       =
 MAX_INITIALIZER_LINES  = 30
 
 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES the list
-# will mention the files that were used to generate the documentation.
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
 # The default value is: YES.
 
 SHOW_USED_FILES        = YES
@@ -677,7 +768,7 @@ LAYOUT_FILE            =
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -696,7 +787,7 @@ CITE_BIB_FILES         =
 QUIET                  = NO
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
 # this implies that the warnings are on.
 #
 # Tip: Turn warnings on while writing the documentation.
@@ -704,7 +795,7 @@ QUIET                  = NO
 
 WARNINGS               = YES
 
-# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
 # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
 # will automatically be disabled.
 # The default value is: YES.
@@ -721,12 +812,22 @@ WARN_IF_DOC_ERROR      = YES
 
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
-# value. If set to NO doxygen will only warn about wrong or incomplete parameter
-# documentation, but not about the absence of documentation.
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
 WARN_NO_PARAMDOC       = YES
 
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
 # will be replaced by the file and line number from which the warning originated
@@ -750,7 +851,7 @@ WARN_LOGFILE           =
 # The INPUT tag is used to specify the files and/or directories that contain
 # documented source files. You may enter file names like myfile.cpp or
 # directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING.
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = @PROJECT_SOURCE_DIR@/include
@@ -758,20 +859,29 @@ INPUT                  = @PROJECT_SOURCE_DIR@/include
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
 
 # If the value of the INPUT tag contains directories, you can use the
 # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank the
-# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
-# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
-# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
-# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
-# *.qsf, *.as and *.js.
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          = *.h
 
@@ -858,6 +968,10 @@ IMAGE_PATH             =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
@@ -867,11 +981,15 @@ INPUT_FILTER           =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
 # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER ) will also be used to filter the input files that are used for
+# INPUT_FILTER) will also be used to filter the input files that are used for
 # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
 # The default value is: NO.
 
@@ -890,7 +1008,7 @@ FILTER_SOURCE_PATTERNS =
 # (index.html). This can be useful if you have a project on for instance GitHub
 # and want to reuse the introduction page also for the doxygen output.
 
-#USE_MDFILE_AS_MAINPAGE =
+USE_MDFILE_AS_MAINPAGE =
 
 #---------------------------------------------------------------------------
 # Configuration options related to source browsing
@@ -919,7 +1037,7 @@ INLINE_SOURCES         = NO
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -931,7 +1049,7 @@ REFERENCED_BY_RELATION = NO
 REFERENCES_RELATION    = NO
 
 # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
 # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
 # link to the documentation.
 # The default value is: YES.
@@ -946,17 +1064,17 @@ REFERENCES_LINK_SOURCE = YES
 # The default value is: YES.
 # This tag requires that the tag SOURCE_BROWSER is set to YES.
 
-#SOURCE_TOOLTIPS        = YES
+SOURCE_TOOLTIPS        = YES
 
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -978,16 +1096,22 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see:
+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
+# performance. This can be particularly helpful with template rich C++ code for
+# which doxygen's built-in parser lacks the necessary type information.
 # Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
+# generated with the -Duse_libclang=ON option for CMake.
 # The default value is: NO.
 
-#CLANG_ASSISTED_PARSING = NO
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to
+# YES then doxygen will add the directory of each input to the include path.
+# The default value is: YES.
+
+CLANG_ADD_INC_PATHS    = YES
 
 # If clang assisted parsing is enabled you can provide the compiler with command
 # line options that you would normally use when invoking the compiler. Note that
@@ -995,7 +1119,20 @@ VERBATIM_HEADERS       = YES
 # specified with INPUT and INCLUDE_PATH.
 # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
 
-#CLANG_OPTIONS          =
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the -p option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
@@ -1008,13 +1145,6 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1027,7 +1157,7 @@ IGNORE_PREFIX          =
 # Configuration options related to the HTML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
 # The default value is: YES.
 
 GENERATE_HTML          = YES
@@ -1093,14 +1223,14 @@ HTML_STYLESHEET        =
 # cascading style sheets that are included after the standard style sheets
 # created by doxygen. Using this option one can overrule certain style aspects.
 # This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefor more robust against future updates.
+# standard style sheet and is therefore more robust against future updates.
 # Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra stylesheet files is of importance (e.g. the last
-# stylesheet in the list overrules the setting of the previous ones in the
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
 # list). For an example see the documentation.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-#HTML_EXTRA_STYLESHEET  =
+HTML_EXTRA_STYLESHEET  =
 
 # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the HTML output directory. Note
@@ -1113,9 +1243,9 @@ HTML_STYLESHEET        =
 HTML_EXTRA_FILES       =
 
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the stylesheet and background images according to
+# will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1144,12 +1274,24 @@ HTML_COLORSTYLE_GAMMA  = 80
 
 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
 # page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: YES.
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 HTML_TIMESTAMP         = YES
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1169,17 +1311,18 @@ HTML_DYNAMIC_SECTIONS  = NO
 # Minimum value: 0, maximum value: 9999, default value: 100.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-#HTML_INDEX_NUM_ENTRIES = 100
+HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
+# for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1218,8 +1361,8 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1241,28 +1384,28 @@ GENERATE_HTMLHELP      = NO
 CHM_FILE               =
 
 # The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler ( hhc.exe). If non-empty
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
 # doxygen will try to run the HTML help compiler on the generated index.hhp.
 # The file has to be specified with full path.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 HHC_LOCATION           =
 
-# The GENERATE_CHI flag controls if a separate .chi index file is generated (
-# YES) or that it should be included in the master .chm file ( NO).
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 GENERATE_CHI           = NO
 
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
 # and project file content.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
 CHM_INDEX_ENCODING     =
 
-# The BINARY_TOC flag controls whether a binary table of contents is generated (
-# YES) or a normal table of contents ( NO) in the .chm file. Furthermore it
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
 # enables the Previous and Next buttons.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
@@ -1294,7 +1437,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1302,8 +1446,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1311,30 +1455,30 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1376,7 +1520,7 @@ DISABLE_INDEX          = NO
 # index structure (just like the one that is generated for HTML Help). For this
 # to work a browser that supports JavaScript, DHTML, CSS and frames is required
 # (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
 # further fine-tune the look of the index. As an example, the default style
 # sheet generated by doxygen has an example that shows how to put an image at
 # the root of the tree instead of the PROJECT_NAME. Since the tree basically has
@@ -1404,13 +1548,24 @@ ENUM_VALUES_PER_LINE   = 4
 
 TREEVIEW_WIDTH         = 250
 
-# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
 # external symbols imported via tag files in a separate window.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1420,7 +1575,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1431,9 +1586,15 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
 # to it using the MATHJAX_RELPATH option.
@@ -1444,13 +1605,13 @@ USE_MATHJAX            = NO
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-#MATHJAX_FORMAT         = HTML-CSS
+MATHJAX_FORMAT         = HTML-CSS
 
 # When MathJax is enabled you need to specify the location relative to the HTML
 # output directory using the MATHJAX_RELPATH option. The destination directory
@@ -1459,8 +1620,8 @@ USE_MATHJAX            = NO
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
@@ -1474,11 +1635,12 @@ MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-#MATHJAX_CODEFILE       =
+MATHJAX_CODEFILE       =
 
 # When the SEARCHENGINE tag is enabled doxygen will generate a search box for
 # the HTML output. The underlying search engine uses javascript and DHTML and
@@ -1502,7 +1664,7 @@ MATHJAX_EXTENSIONS     =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1519,26 +1681,28 @@ SERVER_BASED_SEARCH    = NO
 # external search engine pointed to by the SEARCHENGINE_URL option to obtain the
 # search results.
 #
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#EXTERNAL_SEARCH        = NO
+EXTERNAL_SEARCH        = NO
 
 # The SEARCHENGINE_URL should point to a search engine hosted by a web server
 # which will return the search results when EXTERNAL_SEARCH is enabled.
 #
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
+# Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#SEARCHENGINE_URL       =
+SEARCHENGINE_URL       =
 
 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
 # search data is written to a file for indexing by an external tool. With the
@@ -1546,7 +1710,7 @@ SERVER_BASED_SEARCH    = NO
 # The default file is: searchdata.xml.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#SEARCHDATA_FILE        = searchdata.xml
+SEARCHDATA_FILE        = searchdata.xml
 
 # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
 # EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
@@ -1554,7 +1718,7 @@ SERVER_BASED_SEARCH    = NO
 # projects and redirect the results back to the right project.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#EXTERNAL_SEARCH_ID     =
+EXTERNAL_SEARCH_ID     =
 
 # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
 # projects other than the one defined by this configuration file, but that are
@@ -1564,13 +1728,13 @@ SERVER_BASED_SEARCH    = NO
 # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
-#EXTRA_SEARCH_MAPPINGS  =
+EXTRA_SEARCH_MAPPINGS  =
 
 #---------------------------------------------------------------------------
 # Configuration options related to the LaTeX output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
 # The default value is: YES.
 
 GENERATE_LATEX         = YES
@@ -1586,22 +1750,36 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when USE_PDFLATEX is not enabled the default is latex; when
+# USE_PDFLATEX is enabled the default is pdflatex, and if latex is chosen in
+# the latter case it is overwritten by pdflatex. For specific output languages
+# the default may have been set differently; this depends on the implementation
+# of the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
-# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
 # The default value is: NO.
@@ -1619,9 +1797,12 @@ COMPACT_LATEX          = NO
 PAPER_TYPE             = a4
 
 # The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. To get the times font for
-# instance you can specify
-# EXTRA_PACKAGES=times
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
 # If left blank no extra packages will be included.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1636,9 +1817,9 @@ EXTRA_PACKAGES         =
 # Note: Only use a user-defined header if you know what you are doing! The
 # following commands have a special meaning inside the header: $title,
 # $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empy string,
-# for the replacement values of the other commands the user is refered to
-# HTML_HEADER.
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
@@ -1654,13 +1835,24 @@ LATEX_HEADER           =
 
 LATEX_FOOTER           =
 
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
 # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
 # other source files which should be copied to the LATEX_OUTPUT output
 # directory. Note that the files will be copied as-is; there are no commands or
 # markers available.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-#LATEX_EXTRA_FILES      =
+LATEX_EXTRA_FILES      =
 
 # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
 # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
@@ -1671,9 +1863,11 @@ LATEX_FOOTER           =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1707,17 +1901,33 @@ LATEX_SOURCE_CODE      = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE        = plain
 
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
 # RTF output is optimized for Word 97 and may not look too pretty with other RTF
 # readers/editors.
 # The default value is: NO.
@@ -1732,7 +1942,7 @@ GENERATE_RTF           = NO
 
 RTF_OUTPUT             = rtf
 
-# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
 # The default value is: NO.
@@ -1752,9 +1962,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = NO
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1763,17 +1973,27 @@ RTF_HYPERLINKS         = NO
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
 
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the man page output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
 # classes and files.
 # The default value is: NO.
 
@@ -1802,7 +2022,7 @@ MAN_EXTENSION          = .3
 # MAN_EXTENSION with the initial . removed.
 # This tag requires that the tag GENERATE_MAN is set to YES.
 
-#MAN_SUBDIR             =
+MAN_SUBDIR             =
 
 # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
 # will generate one additional man file for each entity documented in the real
@@ -1817,7 +2037,7 @@ MAN_LINKS              = NO
 # Configuration options related to the XML output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
 # captures the structure of the code including all documentation.
 # The default value is: NO.
 
@@ -1831,7 +2051,7 @@ GENERATE_XML           = YES
 
 XML_OUTPUT             = xml
 
-# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
 # listings (including syntax highlighting and cross-referencing information) to
 # the XML output. Note that enabling this will significantly increase the size
 # of the XML output.
@@ -1840,15 +2060,22 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
 # that can be used to generate PDF.
 # The default value is: NO.
 
-#GENERATE_DOCBOOK       = NO
+GENERATE_DOCBOOK       = NO
 
 # The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
 # If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
@@ -1856,25 +2083,25 @@ XML_PROGRAMLISTING     = YES
 # The default directory is: docbook.
 # This tag requires that the tag GENERATE_DOCBOOK is set to YES.
 
-#DOCBOOK_OUTPUT         = docbook
+DOCBOOK_OUTPUT         = docbook
 
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES doxygen will include the
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
 # program listings (including syntax highlighting and cross-referencing
 # information) to the DOCBOOK output. Note that enabling this will significantly
 # increase the size of the DOCBOOK output.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_DOCBOOK is set to YES.
 
-#DOCBOOK_PROGRAMLISTING = NO
+DOCBOOK_PROGRAMLISTING = NO
 
 #---------------------------------------------------------------------------
 # Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
-# Definitions (see http://autogen.sf.net) file that captures the structure of
-# the code including all documentation. Note that this feature is still
-# experimental and incomplete at the moment.
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -1883,7 +2110,7 @@ GENERATE_AUTOGEN_DEF   = NO
 # Configuration options related to the Perl module output
 #---------------------------------------------------------------------------
 
-# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
 # file that captures the structure of the code including all documentation.
 #
 # Note that this feature is still experimental and incomplete at the moment.
@@ -1891,7 +2118,7 @@ GENERATE_AUTOGEN_DEF   = NO
 
 GENERATE_PERLMOD       = NO
 
-# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
 # Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
 # output from the Perl module output.
 # The default value is: NO.
@@ -1899,9 +2126,9 @@ GENERATE_PERLMOD       = NO
 
 PERLMOD_LATEX          = NO
 
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
 # formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO the
+# understand what is going on. On the other hand, if this tag is set to NO, the
 # size of the Perl module output will be much smaller and Perl will parse it
 # just the same.
 # The default value is: YES.
@@ -1921,14 +2148,14 @@ PERLMOD_MAKEVAR_PREFIX =
 # Configuration options related to the preprocessor
 #---------------------------------------------------------------------------
 
-# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
 # C-preprocessor directives found in the sources and include files.
 # The default value is: YES.
 
 ENABLE_PREPROCESSING   = YES
 
-# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
-# in the source code. If set to NO only conditional compilation will be
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
 # performed. Macro expansion can be done in a controlled way by setting
 # EXPAND_ONLY_PREDEF to YES.
 # The default value is: NO.
@@ -1944,7 +2171,7 @@ MACRO_EXPANSION        = YES
 
 EXPAND_ONLY_PREDEF     = YES
 
-# If the SEARCH_INCLUDES tag is set to YES the includes files in the
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
 # INCLUDE_PATH will be searched if a #include is found.
 # The default value is: YES.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
@@ -1975,8 +2202,8 @@ INCLUDE_FILE_PATTERNS  =
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 PREDEFINED             = DMLC_USE_CXX11 \
-                         "XGB_DLL=" \
-                         "XGB_EXTERN_C="
+                         XGB_DLL= \
+                         XGB_EXTERN_C=
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
@@ -2022,37 +2249,32 @@ TAGFILES               =
 
 GENERATE_TAGFILE       =
 
-# If the ALLEXTERNALS tag is set to YES all external class will be listed in the
-# class index. If set to NO only the inherited external classes will be listed.
+# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
 # The default value is: NO.
 
 ALLEXTERNALS           = NO
 
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
-# the modules index. If set to NO, only the current project's groups will be
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
 # listed.
 # The default value is: YES.
 
 EXTERNAL_GROUPS        = YES
 
-# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
 # the related pages index. If set to NO, only the current project's pages will
 # be listed.
 # The default value is: YES.
 
-#EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
+EXTERNAL_PAGES         = YES
 
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
 # (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
 # NO turns the diagrams off. Note that this option also works with HAVE_DOT
 # disabled, but it is recommended to install and use dot, since it yields more
@@ -2061,23 +2283,14 @@ PERL_PATH              = /usr/bin/perl
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
 # If left empty dia is assumed to be found in the default search path.
 
-#DIA_PATH               =
+DIA_PATH               =
 
-# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# If set to YES the inheritance and collaboration graphs will hide inheritance
 # and usage relations if the target is undocumented or is not a class.
 # The default value is: YES.
 
@@ -2150,7 +2363,7 @@ COLLABORATION_GRAPH    = YES
 
 GROUP_GRAPHS           = YES
 
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
 # collaboration diagrams in a style similar to the OMG's Unified Modeling
 # Language.
 # The default value is: NO.
@@ -2167,9 +2380,31 @@ UML_LOOK               = YES
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-#UML_LIMIT_NUM_FIELDS   = 10
+DOT_WRAP_THRESHOLD     = 17
 
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
@@ -2202,7 +2437,8 @@ INCLUDED_BY_GRAPH      = YES
 #
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command.
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
 # The default value is: NO.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2213,7 +2449,8 @@ CALL_GRAPH             = NO
 #
 # Note that enabling this option will significantly increase the time of a run.
 # So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command.
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
 # The default value is: NO.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2236,13 +2473,17 @@ GRAPHICAL_HIERARCHY    = YES
 DIRECTORY_GRAPH        = YES
 
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot.
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
 # Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
 # png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
-# gif:cairo:gd, gif:gd, gif:gd:gd and svg.
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
 # The default value is: png.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
@@ -2283,16 +2524,25 @@ MSCFILE_DIRS           =
 # contain dia files that are included in the documentation (see the \diafile
 # command).
 
-#DIAFILE_DIRS           =
+DIAFILE_DIRS           =
 
 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
 # path where java can find the plantuml.jar file. If left blank, it is assumed
 # PlantUML is not used or called during a preprocessing step. Doxygen will
 # generate a warning when it encounters a \startuml command in this case and
 # will not generate output for the diagram.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
-#PLANTUML_JAR_PATH      =
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
 
 # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
 # that will be shown in the graph. If the number of nodes in a graph becomes
@@ -2330,7 +2580,7 @@ MAX_DOT_GRAPH_DEPTH    = 0
 
 DOT_TRANSPARENT        = NO
 
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10) support
 # this, this feature is disabled by default.
@@ -2347,9 +2597,11 @@ DOT_MULTI_TARGETS      = YES
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst
index 6721908f9..2b476781b 100644
--- a/doc/jvm/index.rst
+++ b/doc/jvm/index.rst
@@ -41,3 +41,7 @@ Contents
   XGBoost4J Scala API <scaladocs/xgboost4j/index>
   XGBoost4J-Spark Scala API <scaladocs/xgboost4j-spark/index>
   XGBoost4J-Flink Scala API <scaladocs/xgboost4j-flink/index>
+
+.. note::
+
+  Please note that the Flink interface is still under construction.
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index c33a90c81..888683975 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -23,7 +23,7 @@ Requirements
 
 Dask can be installed using either pip or conda (see the dask `installation
 documentation <https://docs.dask.org/en/latest/install.html>`_ for more information).  For
-accelerating XGBoost with GPUs, `dask-cuda <https://github.com/rapidsai/dask-cuda>`_ is
+accelerating XGBoost with GPUs, `dask-cuda <https://github.com/rapidsai/dask-cuda>`__ is
 recommended for creating GPU clusters.
 
 
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 2233336e9..e56680780 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -38,7 +38,7 @@ typedef uint64_t bst_ulong;  // NOLINT(*)
  */
 
 /**
- * @defgroup Library
+ * @defgroup Library Library
  *
  * These functions are used to obtain general information about XGBoost including version,
  * build info and current global configuration.
@@ -112,7 +112,7 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
 /**@}*/
 
 /**
- * @defgroup DMatrix
+ * @defgroup DMatrix DMatrix
  *
  * @brief DMatrix is the baisc data storage for XGBoost used by all XGBoost algorithms
  *        including both training, prediction and explanation. There are a few variants of
@@ -200,7 +200,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
-                                   bst_ulong nrow, char const *c_json_config, DMatrixHandle *out);
+                                   bst_ulong nrow, char const *config, DMatrixHandle *out);
 
 /*!
  * \brief create a matrix content from CSC format
@@ -281,7 +281,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const *
                                                   DMatrixHandle *out);
 
 /**
- * @defgroup Streaming
+ * @defgroup Streaming Streaming
  * @ingroup DMatrix
  *
  * @brief Quantile DMatrix and external memory DMatrix can be created from batches of
@@ -431,7 +431,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
  * - Step 0: Define a data iterator with 2 methods `reset`, and `next`.
  * - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle.
  * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *           `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
+ *           \ref XGDMatrixCreateFromCallback, along with other parameters encoded as a JSON object.
  * - Step 3: Call appropriate data setters in `next` functions.
  *
  * \param iter    A handle to external data iterator.
@@ -830,7 +830,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
 /** @} */  // End of DMatrix
 
 /**
- * @defgroup Booster
+ * @defgroup Booster Booster
  *
  * @brief The `Booster` class is the gradient-boosted model for XGBoost.
  * @{
@@ -953,7 +953,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle d
  */
 
 /**
- * @defgroup Prediction
+ * @defgroup Prediction Prediction
  * @ingroup Booster
  *
  * @brief These functions are used for running prediction and explanation algorithms.
@@ -1155,7 +1155,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v
 
 
 /**
- * @defgroup Serialization
+ * @defgroup Serialization Serialization
  * @ingroup Booster
  *
  * @brief There are multiple ways to serialize a Booster object depending on the use case.
@@ -1490,7 +1490,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
 /**@}*/  // End of Booster
 
 /**
- * @defgroup Collective
+ * @defgroup Collective Collective
  *
  * @brief Experimental support for exposing internal communicator in XGBoost.
  *

From fa267ad0939721c4e04d922e7895706b3b2ff2e7 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 27 Apr 2023 22:48:31 +0800
Subject: [PATCH 27/34] [CI] Freeze R version to 4.2.0 with MSVC. (#9104)

---
 .github/workflows/r_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index 0ec95ace1..640ebce81 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -54,7 +54,7 @@ jobs:
       matrix:
         config:
           - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
-          - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'cmake'}
+          - {os: windows-latest, r: '4.2.0', compiler: 'msvc', build: 'cmake'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}

From 0e470ef6062a2efa75f3c1f1305e1daae312e4d5 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 28 Apr 2023 00:51:41 +0800
Subject: [PATCH 28/34] Optimize prediction with QuantileDMatrix. (#9096)

- Reduce overhead in `FVecDrop`.
- Reduce overhead caused by `HostVector()` calls.
---
 include/xgboost/tree_model.h   | 13 +++++-------
 src/data/gradient_index.cc     | 37 +++++++++++++++++++++-------------
 src/data/gradient_index.h      |  3 +++
 src/predictor/cpu_predictor.cc | 29 +++++++++++++++-----------
 src/tree/updater_refresh.cc    |  2 +-
 5 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index 61dd94302..393dda59c 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -567,7 +567,7 @@ class RegTree : public Model {
      * \brief drop the trace after fill, must be called after fill.
      * \param inst The sparse instance to drop.
      */
-    void Drop(const SparsePage::Inst& inst);
+    void Drop();
     /*!
      * \brief returns the size of the feature vector
      * \return the size of the feature vector
@@ -807,13 +807,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
   has_missing_ = data_.size() != feature_count;
 }
 
-inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
-  for (auto const& entry : inst) {
-    if (entry.index >= data_.size()) {
-      continue;
-    }
-    data_[entry.index].flag = -1;
-  }
+inline void RegTree::FVec::Drop() {
+  Entry e{};
+  e.flag = -1;
+  std::fill_n(data_.data(), data_.size(), e);
   has_missing_ = true;
 }
 
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 0a606ecd5..3b3323bb5 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -166,6 +166,12 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
   auto const &values = cut.Values();
   auto const &mins = cut.MinValues();
   auto const &ptrs = cut.Ptrs();
+  return this->GetFvalue(ptrs, values, mins, ridx, fidx, is_cat);
+}
+
+float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
+                                  std::vector<float> const &values, std::vector<float> const &mins,
+                                  bst_row_t ridx, bst_feature_t fidx, bool is_cat) const {
   if (is_cat) {
     auto gidx = GetGindex(ridx, fidx);
     if (gidx == -1) {
@@ -181,24 +187,27 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
     }
     return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
   };
-
-  if (columns_->GetColumnType(fidx) == common::kDenseColumn) {
-    if (columns_->AnyMissing()) {
+  switch (columns_->GetColumnType(fidx)) {
+    case common::kDenseColumn: {
+      if (columns_->AnyMissing()) {
+        return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
+          auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
+          return get_bin_val(column);
+        });
+      } else {
+        return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
+          auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
+          auto bin_idx = column[ridx];
+          return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
+        });
+      }
+    }
+    case common::kSparseColumn: {
       return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-        auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
-        return get_bin_val(column);
-      });
-    } else {
-      return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-        auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
+        auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
         return get_bin_val(column);
       });
     }
-  } else {
-    return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
-      auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
-      return get_bin_val(column);
-    });
   }
 
   SPAN_CHECK(false);
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 3cb0709bd..4c35870db 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -239,6 +239,9 @@ class GHistIndexMatrix {
   bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
 
   float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
+  float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
+                  std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
+                  bool is_cat) const;
 
  private:
   std::unique_ptr<common::ColumnMatrix> columns_;
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 2b7a96d9c..b3b4c5e80 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -75,7 +75,7 @@ bst_float PredValue(const SparsePage::Inst &inst,
       psum += (*trees[i])[nidx].LeafValue();
     }
   }
-  p_feats->Drop(inst);
+  p_feats->Drop();
   return psum;
 }
 
@@ -172,13 +172,11 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
   }
 }
 
-template <typename DataView>
-void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
+void FVecDrop(std::size_t const block_size, std::size_t const fvec_offset,
+              std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
-    const SparsePage::Inst inst = (*batch)[batch_offset + i];
-    feats.Drop(inst);
+    feats.Drop();
   }
 }
 
@@ -196,11 +194,15 @@ struct SparsePageView {
 struct GHistIndexMatrixView {
  private:
   GHistIndexMatrix const &page_;
-  uint64_t n_features_;
+  std::uint64_t const n_features_;
   common::Span<FeatureType const> ft_;
   common::Span<Entry> workspace_;
   std::vector<size_t> current_unroll_;
 
+  std::vector<std::uint32_t> const& ptrs_;
+  std::vector<float> const& mins_;
+  std::vector<float> const& values_;
+
  public:
   size_t base_rowid;
 
@@ -213,6 +215,9 @@ struct GHistIndexMatrixView {
         ft_{ft},
         workspace_{workplace},
         current_unroll_(n_threads > 0 ? n_threads : 1, 0),
+        ptrs_{_page.cut.Ptrs()},
+        mins_{_page.cut.MinValues()},
+        values_{_page.cut.Values()},
         base_rowid{_page.base_rowid} {}
 
   SparsePage::Inst operator[](size_t r) {
@@ -221,7 +226,7 @@ struct GHistIndexMatrixView {
     size_t non_missing{static_cast<std::size_t>(beg)};
 
     for (bst_feature_t c = 0; c < n_features_; ++c) {
-      float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
+      float f = page_.GetFvalue(ptrs_, values_, mins_, r, c, common::IsCat(ft_, c));
       if (!common::CheckNAN(f)) {
         workspace_[non_missing] = Entry{c, f};
         ++non_missing;
@@ -301,7 +306,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &mod
     // process block of rows through all trees to keep cache locality
     PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
                       fvec_offset, block_size, out_predt);
-    FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
+    FVecDrop(block_size, fvec_offset, p_thread_temp);
   });
 }
 
@@ -529,7 +534,7 @@ class ColumnSplitHelper {
 
       FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
       MaskAllTrees(batch_offset, fvec_offset, block_size);
-      FVecDrop(block_size, batch_offset, &batch, fvec_offset, &feat_vecs_);
+      FVecDrop(block_size, fvec_offset, &feat_vecs_);
     });
 
     AllreduceBitVectors();
@@ -780,7 +785,7 @@ class CPUPredictor : public Predictor {
           }
           preds[ridx * ntree_limit + j] = static_cast<bst_float>(nidx);
         }
-        feats.Drop(page[i]);
+        feats.Drop();
       });
     }
   }
@@ -853,7 +858,7 @@ class CPUPredictor : public Predictor {
                   (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
             }
           }
-          feats.Drop(page[i]);
+          feats.Drop();
           // add base margin to BIAS
           if (base_margin.Size() != 0) {
             CHECK_EQ(base_margin.Shape(1), ngroup);
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 17c565490..448492de0 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -79,7 +79,7 @@ class TreeRefresher : public TreeUpdater {
                      dmlc::BeginPtr(stemp[tid]) + offset);
             offset += tree->NumNodes();
           }
-          feats.Drop(inst);
+          feats.Drop();
         });
       }
       // aggregate the statistics

From e206b899ef4ed6671c80897c52b180223b7e3607 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 28 Apr 2023 02:39:12 +0800
Subject: [PATCH 29/34] Rework MAP and Pairwise for LTR. (#9075)

---
 R-package/src/Makevars.in                     |   1 -
 R-package/src/Makevars.win                    |   1 -
 doc/model.schema                              |  18 +-
 doc/parameter.rst                             |  43 +-
 python-package/xgboost/testing/__init__.py    |   7 +-
 src/metric/rank_metric.cc                     |   1 -
 src/objective/lambdarank_obj.cc               | 193 +++++
 src/objective/lambdarank_obj.cu               | 106 +++
 src/objective/lambdarank_obj.h                |  33 +
 src/objective/objective.cc                    |   2 -
 src/objective/rank_obj.cc                     |  17 -
 src/objective/rank_obj.cu                     | 789 ------------------
 tests/cpp/objective/test_lambdarank_obj.cc    | 121 +++
 tests/cpp/objective/test_lambdarank_obj.cu    |  12 +
 tests/cpp/objective/test_lambdarank_obj.h     |   4 +
 tests/cpp/objective/test_ranking_obj.cc       |  83 --
 tests/cpp/objective/test_ranking_obj_gpu.cu   | 175 ----
 tests/python/test_with_sklearn.py             |   2 +-
 .../test_with_spark/test_spark_local.py       | 139 +--
 19 files changed, 612 insertions(+), 1135 deletions(-)
 delete mode 100644 src/objective/rank_obj.cc
 delete mode 100644 src/objective/rank_obj.cu
 delete mode 100644 tests/cpp/objective/test_ranking_obj.cc
 delete mode 100644 tests/cpp/objective/test_ranking_obj_gpu.cu

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 04f0a74a5..a84459db9 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -32,7 +32,6 @@ OBJECTS= \
     $(PKGROOT)/src/objective/objective.o \
     $(PKGROOT)/src/objective/regression_obj.o \
     $(PKGROOT)/src/objective/multiclass_obj.o \
-    $(PKGROOT)/src/objective/rank_obj.o \
     $(PKGROOT)/src/objective/lambdarank_obj.o \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 969cb7ff4..25c577e3a 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -32,7 +32,6 @@ OBJECTS= \
     $(PKGROOT)/src/objective/objective.o \
     $(PKGROOT)/src/objective/regression_obj.o \
     $(PKGROOT)/src/objective/multiclass_obj.o \
-    $(PKGROOT)/src/objective/rank_obj.o \
     $(PKGROOT)/src/objective/lambdarank_obj.o \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
diff --git a/doc/model.schema b/doc/model.schema
index b9e2da305..103d9d9e4 100644
--- a/doc/model.schema
+++ b/doc/model.schema
@@ -219,6 +219,16 @@
         "num_pairsample": { "type": "string" },
         "fix_list_weight": { "type": "string" }
       }
+    },
+    "lambdarank_param": {
+      "type": "object",
+      "properties": {
+        "lambdarank_num_pair_per_sample": { "type": "string" },
+        "lambdarank_pair_method": { "type": "string" },
+        "lambdarank_unbiased": {"type": "string" },
+        "lambdarank_bias_norm": {"type": "string" },
+        "ndcg_exp_gain": {"type": "string"}
+      }
     }
   },
   "type": "object",
@@ -477,22 +487,22 @@
               "type": "object",
               "properties": {
                 "name": { "const": "rank:pairwise" },
-                "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
+                "lambdarank_param": { "$ref": "#/definitions/lambdarank_param"}
               },
               "required": [
                 "name",
-                "lambda_rank_param"
+                "lambdarank_param"
               ]
             },
             {
               "type": "object",
               "properties": {
                 "name": { "const": "rank:ndcg" },
-                "lambda_rank_param": { "$ref": "#/definitions/lambda_rank_param"}
+                "lambdarank_param": { "$ref": "#/definitions/lambdarank_param"}
               },
               "required": [
                 "name",
-                "lambda_rank_param"
+                "lambdarank_param"
               ]
             },
             {
diff --git a/doc/parameter.rst b/doc/parameter.rst
index c070e7018..8c7cadcdc 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -233,7 +233,7 @@ Parameters for Tree Booster
   .. note:: This parameter is working-in-progress.
 
   - The strategy used for training multi-target models, including multi-target regression
-  and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
+    and multi-class classification. See :doc:`/tutorials/multioutput` for more information.
 
     - ``one_output_per_tree``: One model for each target.
     - ``multi_output_tree``:  Use multi-target trees.
@@ -380,9 +380,9 @@ Specify the learning task and the corresponding learning objective. The objectiv
     See :doc:`/tutorials/aft_survival_analysis` for details.
   - ``multi:softmax``: set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes)
   - ``multi:softprob``: same as softmax, but output a vector of ``ndata * nclass``, which can be further reshaped to ``ndata * nclass`` matrix. The result contains predicted probability of each data point belonging to each class.
-  - ``rank:pairwise``: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
-  - ``rank:ndcg``: Use LambdaMART to perform list-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized
-  - ``rank:map``: Use LambdaMART to perform list-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
+  - ``rank:ndcg``: Use LambdaMART to perform pair-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized. This objective supports position debiasing for click data.
+  - ``rank:map``: Use LambdaMART to perform pair-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
+  - ``rank:pairwise``: Use LambdaRank to perform pair-wise ranking using the `ranknet` objective.
   - ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications>`_.
   - ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications>`_.
 
@@ -395,8 +395,9 @@ Specify the learning task and the corresponding learning objective. The objectiv
 
 * ``eval_metric`` [default according to objective]
 
-  - Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, mean average precision for ranking)
-  - User can add multiple evaluation metrics. Python users: remember to pass the metrics in as list of parameters pairs instead of map, so that latter ``eval_metric`` won't override previous one
+  - Evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and logloss for classification, `mean average precision` for ``rank:map``, etc.)
+  - User can add multiple evaluation metrics. Python users: remember to pass the metrics in as list of parameters pairs instead of map, so that latter ``eval_metric`` won't override previous ones
+
   - The choices are listed below:
 
     - ``rmse``: `root mean square error <http://en.wikipedia.org/wiki/Root_mean_square_error>`_
@@ -480,6 +481,36 @@ Parameter for using AFT Survival Loss (``survival:aft``) and Negative Log Likeli
 
 * ``aft_loss_distribution``: Probability Density Function, ``normal``, ``logistic``, or ``extreme``.
 
+.. _ltr-param:
+
+Parameters for learning to rank (``rank:ndcg``, ``rank:map``, ``rank:pairwise``)
+================================================================================
+
+These are parameters specific to the learning to rank task. See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an in-depth explanation.
+
+* ``lambdarank_pair_method`` [default = ``mean``]
+
+  How to construct pairs for pair-wise learning.
+
+  - ``mean``: Sample ``lambdarank_num_pair_per_sample`` pairs for each document in the query list.
+  - ``topk``: Focus on top-``lambdarank_num_pair_per_sample`` documents. Construct :math:`|query|` pairs for each document at the top-``lambdarank_num_pair_per_sample`` ranked by the model.
+
+* ``lambdarank_num_pair_per_sample`` [range = :math:`[1, \infty)`]
+
+  It specifies the number of pairs sampled for each document when pair method is ``mean``, or the truncation level for queries when the pair method is ``topk``. For example, to train with ``ndcg@6``, set ``lambdarank_num_pair_per_sample`` to :math:`6` and ``lambdarank_pair_method`` to ``topk``.
+
+* ``lambdarank_unbiased`` [default = ``false``]
+
+  Specify whether we need to debias input click data.
+
+* ``lambdarank_bias_norm`` [default = 2.0]
+
+  :math:`L_p` normalization for position debiasing, default is :math:`L_2`. Only relevant when ``lambdarank_unbiased`` is set to true.
+
+* ``ndcg_exp_gain`` [default = ``true``]
+
+  Whether we should use exponential gain function for ``NDCG``. There are two forms of gain function for ``NDCG``, one is using relevance value directly while the other is using :math:`2^{rel} - 1` to emphasize retrieving relevant documents. When ``ndcg_exp_gain`` is true (the default), relevance degree cannot be greater than 31.
+
 ***********************
 Command Line Parameters
 ***********************
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 20a4c681e..5566e0b2d 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -431,8 +431,11 @@ def make_ltr(
     """Make a dataset for testing LTR."""
     rng = np.random.default_rng(1994)
     X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
-    y = rng.integers(0, max_rel, size=n_samples)
-    qid = rng.integers(0, n_query_groups, size=n_samples)
+    y = np.sum(X, axis=1)
+    y -= y.min()
+    y = np.round(y / y.max() * max_rel).astype(np.int32)
+
+    qid = rng.integers(0, n_query_groups, size=n_samples, dtype=np.int32)
     w = rng.normal(0, 1.0, size=n_query_groups)
     w -= np.min(w)
     w /= np.max(w)
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 4f272e939..c4549458d 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -493,7 +493,6 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
     auto rank_idx = p_cache->SortedIdx(ctx_, predt.ConstHostSpan());
 
     common::ParallelFor(p_cache->Groups(), ctx_->Threads(), [&](auto g) {
-      auto g_predt = h_predt.Slice(linalg::Range(gptr[g], gptr[g + 1]));
       auto g_label = h_label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
       auto g_rank = rank_idx.subspan(gptr[g]);
 
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
index 30957f81a..d0ff5bda5 100644
--- a/src/objective/lambdarank_obj.cc
+++ b/src/objective/lambdarank_obj.cc
@@ -69,6 +69,7 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
       lj(i) += g_lj(i);
     }
   }
+
   // The ti+ is not guaranteed to decrease since it depends on the |\delta Z|
   //
   // The update normalizes the ti+ to make ti+(0) equal to 1, which breaks the probability
@@ -432,9 +433,201 @@ void LambdaRankUpdatePositionBias(Context const*, linalg::VectorView<double cons
 #endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace cuda_impl
 
+namespace cpu_impl {
+void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
+             common::Span<std::size_t const> rank_idx, std::shared_ptr<ltr::MAPCache> p_cache) {
+  auto h_n_rel = p_cache->NumRelevant(ctx);  // out: running label sum per ranked position
+  auto gptr = p_cache->DataGroupPtr(ctx);  // group g covers [gptr[g], gptr[g + 1])
+
+  CHECK_EQ(h_n_rel.size(), gptr.back());  // one slot per document over all groups
+  CHECK_EQ(h_n_rel.size(), label.Size());
+
+  auto h_acc = p_cache->Acc(ctx);  // out: running sum of label / rank position
+
+  common::ParallelFor(p_cache->Groups(), ctx->Threads(), [&](auto g) {
+    auto cnt = gptr[g + 1] - gptr[g];  // number of documents in group g
+    auto g_n_rel = h_n_rel.subspan(gptr[g], cnt);
+    auto g_rank = rank_idx.subspan(gptr[g], cnt);  // indices into g_label, in rank order
+    auto g_label = label.Slice(linalg::Range(gptr[g], gptr[g + 1]));
+
+    // The number of relevant documents at each position (prefix sum of labels in rank order)
+    g_n_rel[0] = g_label(g_rank[0]);  // NOTE(review): assumes a non-empty group - confirm
+    for (std::size_t k = 1; k < g_rank.size(); ++k) {
+      g_n_rel[k] = g_n_rel[k - 1] + g_label(g_rank[k]);
+    }
+
+    // \sum l_k/k, accumulated in rank order; the label at 0-based index k is divided by k + 1
+    auto g_acc = h_acc.subspan(gptr[g], cnt);
+    g_acc[0] = g_label(g_rank[0]) / 1.0;
+
+    for (std::size_t k = 1; k < g_rank.size(); ++k) {
+      g_acc[k] = g_acc[k - 1] + (g_label(g_rank[k]) / static_cast<double>(k + 1));
+    }
+  });
+}
+}  // namespace cpu_impl
+
+class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
+ public:
+  void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
+                       const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
+    CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective.";
+    if (ctx_->IsCUDA()) {  // GPU path is handled entirely by the CUDA implementation
+      return cuda_impl::LambdaRankGetGradientMAP(
+          ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
+          tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
+          out_gpair);
+    }
+
+    auto gptr = p_cache_->DataGroupPtr(ctx_).data();  // raw pointer to group offsets
+    bst_group_t n_groups = p_cache_->Groups();
+
+    out_gpair->Resize(info.num_row_);
+    auto h_gpair = out_gpair->HostSpan();
+    auto h_label = info.labels.HostView().Slice(linalg::All(), 0);  // first label column
+    auto h_predt = predt.ConstHostSpan();
+    auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
+    auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
+
+    auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
+
+    cpu_impl::MAPStat(ctx_, h_label, rank_idx, GetCache());  // fill n_rel / acc in the cache
+    auto n_rel = GetCache()->NumRelevant(ctx_);
+    auto acc = GetCache()->Acc(ctx_);
+
+    auto delta_map = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low,
+                         bst_group_t g) {
+      if (rank_high > rank_low) {  // make rank_high the smaller rank before DeltaMAP
+        std::swap(rank_high, rank_low);
+        std::swap(y_high, y_low);
+      }
+      auto cnt = gptr[g + 1] - gptr[g];  // number of documents in group g
+      // In a hot loop: spans are built from raw pointers rather than via subspan()
+      auto g_n_rel = common::Span<double const>{n_rel.data() + gptr[g], cnt};
+      auto g_acc = common::Span<double const>{acc.data() + gptr[g], cnt};
+      auto d = DeltaMAP(y_high, y_low, rank_high, rank_low, g_n_rel, g_acc);
+      return d;
+    };
+    using D = decltype(delta_map);  // closure type, passed as explicit template argument
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto cnt = gptr[g + 1] - gptr[g];
+      auto w = h_weight[g];  // weights are indexed by group
+      auto g_predt = h_predt.subspan(gptr[g], cnt);
+      auto g_gpair = h_gpair.subspan(gptr[g], cnt);
+      auto g_label = h_label.Slice(make_range(g));
+      auto g_rank = rank_idx.subspan(gptr[g], cnt);
+
+      auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta_map, g_gpair);
+
+      if (param_.lambdarank_unbiased) {  // picks the <true, D> or <false, D> instantiation
+        std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, D>, args);
+      } else {
+        std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, D>, args);
+      }
+    });
+  }
+  static char const* Name() { return "rank:map"; }  // name used when registering
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
+    return this->RankEvalMetric("map");
+  }
+};
+
+#if !defined(XGBOOST_USE_CUDA)  // definitions used only by builds without CUDA
+namespace cuda_impl {
+void MAPStat(Context const*, MetaInfo const&, common::Span<std::size_t const>,
+             std::shared_ptr<ltr::MAPCache>) {
+  common::AssertGPUSupport();  // parameters are unnamed: this stub never reads them
+}
+
+void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector<float> const&,
+                              const MetaInfo&, std::shared_ptr<ltr::MAPCache>,
+                              linalg::VectorView<double const>,  // input bias ratio
+                              linalg::VectorView<double const>,  // input bias ratio
+                              linalg::VectorView<double>, linalg::VectorView<double>,
+                              HostDeviceVector<GradientPair>*) {
+  common::AssertGPUSupport();  // parameters are unnamed: this stub never reads them
+}
+}  // namespace cuda_impl
+#endif  // !defined(XGBOOST_USE_CUDA)
+
+/**
+ * \brief The RankNet loss: a LambdaRank objective whose pair delta is the constant 1.0.
+ */
+class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::RankingCache> {
+ public:
+  void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
+                       const MetaInfo& info, HostDeviceVector<GradientPair>* out_gpair) {
+    CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective.";
+    if (ctx_->IsCUDA()) {  // GPU path is handled entirely by the CUDA implementation
+      return cuda_impl::LambdaRankGetGradientPairwise(
+          ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id),
+          tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id),
+          out_gpair);
+    }
+
+    auto gptr = p_cache_->DataGroupPtr(ctx_);  // group g covers [gptr[g], gptr[g + 1])
+    bst_group_t n_groups = p_cache_->Groups();
+
+    out_gpair->Resize(info.num_row_);
+    auto h_gpair = out_gpair->HostSpan();
+    auto h_label = info.labels.HostView().Slice(linalg::All(), 0);  // first label column
+    auto h_predt = predt.ConstHostSpan();
+    auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
+
+    auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
+    auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
+
+    auto delta = [](auto...) { return 1.0; };  // every pair gets the same delta
+    using D = decltype(delta);  // closure type, passed as explicit template argument
+
+    common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
+      auto cnt = gptr[g + 1] - gptr[g];  // number of documents in group g
+      auto w = h_weight[g];  // weights are indexed by group
+      auto g_predt = h_predt.subspan(gptr[g], cnt);
+      auto g_gpair = h_gpair.subspan(gptr[g], cnt);
+      auto g_label = h_label.Slice(make_range(g));
+      auto g_rank = rank_idx.subspan(gptr[g], cnt);
+
+      auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);
+      if (param_.lambdarank_unbiased) {  // picks the <true, D> or <false, D> instantiation
+        std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, D>, args);
+      } else {
+        std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, D>, args);
+      }
+    });
+  }
+
+  static char const* Name() { return "rank:pairwise"; }  // name used when registering
+  [[nodiscard]] const char* DefaultEvalMetric() const override {
+    return this->RankEvalMetric("ndcg");
+  }
+};
+
+#if !defined(XGBOOST_USE_CUDA)  // definition used only by builds without CUDA
+namespace cuda_impl {
+void LambdaRankGetGradientPairwise(Context const*, std::int32_t, HostDeviceVector<float> const&,
+                                   const MetaInfo&, std::shared_ptr<ltr::RankingCache>,
+                                   linalg::VectorView<double const>,  // input bias ratio
+                                   linalg::VectorView<double const>,  // input bias ratio
+                                   linalg::VectorView<double>, linalg::VectorView<double>,
+                                   HostDeviceVector<GradientPair>*) {
+  common::AssertGPUSupport();  // parameters are unnamed: this stub never reads them
+}
+}  // namespace cuda_impl
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 XGBOOST_REGISTER_OBJECTIVE(LambdaRankNDCG, LambdaRankNDCG::Name())
     .describe("LambdaRank with NDCG loss as objective")
     .set_body([]() { return new LambdaRankNDCG{}; });
 
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankPairwise, LambdaRankPairwise::Name())
+    .describe("LambdaRank with RankNet loss as objective")
+    .set_body([]() { return new LambdaRankPairwise{}; });
+
+XGBOOST_REGISTER_OBJECTIVE(LambdaRankMAP, LambdaRankMAP::Name())
+    .describe("LambdaRank with MAP loss as objective.")
+    .set_body([]() { return new LambdaRankMAP{}; });
+
 DMLC_REGISTRY_FILE_TAG(lambdarank_obj);
 }  // namespace xgboost::obj
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index 27b5872a8..110e4ae87 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -390,6 +390,112 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
   Launch(ctx, iter, preds, info, p_cache, delta_ndcg, ti_plus, tj_minus, li, lj, out_gpair);
 }
 
+void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
+             std::shared_ptr<ltr::MAPCache> p_cache) {
+  common::Span<double> out_n_rel = p_cache->NumRelevant(ctx);  // out: running label sum
+  common::Span<double> out_acc = p_cache->Acc(ctx);  // out: running sum of label / rank
+
+  CHECK_EQ(out_n_rel.size(), info.num_row_);  // both outputs hold one value per row
+  CHECK_EQ(out_acc.size(), info.num_row_);
+
+  auto group_ptr = p_cache->DataGroupPtr(ctx);
+  auto key_it = dh::MakeTransformIterator<std::size_t>(  // scan key: SegmentId of each row
+      thrust::make_counting_iterator(0ul),
+      [=] XGBOOST_DEVICE(std::size_t i) -> std::size_t { return dh::SegmentId(group_ptr, i); });
+  auto label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0);  // first label column
+  auto const* cuctx = ctx->CUDACtx();
+
+  {
+    // calculate number of relevant documents: keyed inclusive scan of labels in rank order
+    auto val_it = dh::MakeTransformIterator<double>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> double {
+          auto g = dh::SegmentId(group_ptr, i);
+          auto g_label = label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+          auto idx_in_group = i - group_ptr[g];
+          auto g_sorted_idx = d_rank_idx.subspan(group_ptr[g], group_ptr[g + 1] - group_ptr[g]);
+          return static_cast<double>(g_label(g_sorted_idx[idx_in_group]));
+        });
+    thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + info.num_row_, val_it,
+                                  out_n_rel.data());
+  }
+  {
+    // \sum l_k/k: keyed inclusive scan of label / (1-based position within the group)
+    auto val_it = dh::MakeTransformIterator<double>(
+        thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> double {
+          auto g = dh::SegmentId(group_ptr, i);
+          auto g_label = label.Slice(linalg::Range(group_ptr[g], group_ptr[g + 1]));
+          auto g_sorted_idx = d_rank_idx.subspan(group_ptr[g], group_ptr[g + 1] - group_ptr[g]);
+          auto idx_in_group = i - group_ptr[g];
+          double rank_in_group = idx_in_group + 1.0;  // 0-based index -> 1-based rank
+          return static_cast<double>(g_label(g_sorted_idx[idx_in_group])) / rank_in_group;
+        });
+    thrust::inclusive_scan_by_key(cuctx->CTP(), key_it, key_it + info.num_row_, val_it,
+                                  out_acc.data());
+  }
+}
+
+void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
+                              HostDeviceVector<float> const& predt, const MetaInfo& info,
+                              std::shared_ptr<ltr::MAPCache> p_cache,
+                              linalg::VectorView<double const> ti_plus,   // input bias ratio
+                              linalg::VectorView<double const> tj_minus,  // input bias ratio
+                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                              HostDeviceVector<GradientPair>* out_gpair) {
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+
+  info.labels.SetDevice(device_id);  // labels and predictions are set to the same device
+  predt.SetDevice(device_id);
+
+  CHECK(p_cache);
+
+  auto d_predt = predt.ConstDeviceSpan();
+  auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);
+
+  MAPStat(ctx, info, d_sorted_idx, p_cache);  // fill n_rel / acc in the cache
+  auto d_n_rel = p_cache->NumRelevant(ctx);
+  auto d_acc = p_cache->Acc(ctx);
+  auto d_gptr = p_cache->DataGroupPtr(ctx).data();  // raw pointer, captured by value below
+
+  auto delta_map = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high,
+                                      std::size_t rank_low, bst_group_t g) {
+    if (rank_high > rank_low) {  // make rank_high the smaller rank before DeltaMAP
+      thrust::swap(rank_high, rank_low);
+      thrust::swap(y_high, y_low);
+    }
+    auto cnt = d_gptr[g + 1] - d_gptr[g];  // number of documents in group g
+    auto g_n_rel = d_n_rel.subspan(d_gptr[g], cnt);
+    auto g_acc = d_acc.subspan(d_gptr[g], cnt);
+    auto d = DeltaMAP(y_high, y_low, rank_high, rank_low, g_n_rel, g_acc);
+    return d;
+  };
+
+  Launch(ctx, iter, predt, info, p_cache, delta_map, ti_plus, tj_minus, li, lj, out_gpair);
+}
+
+void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
+                                   HostDeviceVector<float> const& predt, const MetaInfo& info,
+                                   std::shared_ptr<ltr::RankingCache> p_cache,
+                                   linalg::VectorView<double const> ti_plus,   // input bias ratio
+                                   linalg::VectorView<double const> tj_minus,  // input bias ratio
+                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                                   HostDeviceVector<GradientPair>* out_gpair) {
+  std::int32_t device_id = ctx->gpu_id;
+  dh::safe_cuda(cudaSetDevice(device_id));
+
+  info.labels.SetDevice(device_id);  // labels and predictions are set to the same device
+  predt.SetDevice(device_id);
+
+  auto d_predt = predt.ConstDeviceSpan();
+  // NOTE(review): result is never read below; presumably called to populate the cache - confirm
+  auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt);
+
+  auto delta = [] XGBOOST_DEVICE(float, float, std::size_t, std::size_t, bst_group_t) {
+    return 1.0;  // every pair gets the same delta
+  };
+
+  Launch(ctx, iter, predt, info, p_cache, delta, ti_plus, tj_minus, li, lj, out_gpair);
+}
+
 namespace {
 struct ReduceOp {
   template <typename Tup>
diff --git a/src/objective/lambdarank_obj.h b/src/objective/lambdarank_obj.h
index 0eb06e27c..c2222c028 100644
--- a/src/objective/lambdarank_obj.h
+++ b/src/objective/lambdarank_obj.h
@@ -156,6 +156,27 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter,
                                linalg::VectorView<double> li, linalg::VectorView<double> lj,
                                HostDeviceVector<GradientPair>* out_gpair);
 
+/**
+ * \brief Generate statistic for MAP used for calculating \Delta Z in lambda mart.
+ */
+void MAPStat(Context const* ctx, MetaInfo const& info, common::Span<std::size_t const> d_rank_idx,
+             std::shared_ptr<ltr::MAPCache> p_cache);
+
+void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter,
+                              HostDeviceVector<float> const& predt, MetaInfo const& info,
+                              std::shared_ptr<ltr::MAPCache> p_cache,
+                              linalg::VectorView<double const> t_plus,   // input bias ratio
+                              linalg::VectorView<double const> t_minus,  // input bias ratio
+                              linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                              HostDeviceVector<GradientPair>* out_gpair);
+
+void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter,
+                                   HostDeviceVector<float> const& predt, const MetaInfo& info,
+                                   std::shared_ptr<ltr::RankingCache> p_cache,
+                                   linalg::VectorView<double const> ti_plus,   // input bias ratio
+                                   linalg::VectorView<double const> tj_minus,  // input bias ratio
+                                   linalg::VectorView<double> li, linalg::VectorView<double> lj,
+                                   HostDeviceVector<GradientPair>* out_gpair);
 
 void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double const> li_full,
                                   linalg::VectorView<double const> lj_full,
@@ -165,6 +186,18 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
                                   std::shared_ptr<ltr::RankingCache> p_cache);
 }  // namespace cuda_impl
 
+namespace cpu_impl {
+/**
+ * \brief Compute the MAP statistics used for calculating \Delta Z in LambdaMART.
+ *
+ * \param label    Ground truth relevance label.
+ * \param rank_idx Sorted index of prediction.
+ * \param p_cache  An initialized MAPCache.
+ */
+void MAPStat(Context const* ctx, linalg::VectorView<float const> label,
+             common::Span<std::size_t const> rank_idx, std::shared_ptr<ltr::MAPCache> p_cache);
+}  // namespace cpu_impl
+
 /**
  * \param Construct pairs on CPU
  *
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
index 7d2c37811..85cd9803d 100644
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -47,7 +47,6 @@ DMLC_REGISTRY_LINK_TAG(regression_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
-DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(lambdarank_obj);
 DMLC_REGISTRY_LINK_TAG(lambdarank_obj_cu);
 #else
@@ -55,7 +54,6 @@ DMLC_REGISTRY_LINK_TAG(regression_obj);
 DMLC_REGISTRY_LINK_TAG(quantile_obj);
 DMLC_REGISTRY_LINK_TAG(hinge_obj);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj);
-DMLC_REGISTRY_LINK_TAG(rank_obj);
 DMLC_REGISTRY_LINK_TAG(lambdarank_obj);
 #endif  // XGBOOST_USE_CUDA
 }  // namespace obj
diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc
deleted file mode 100644
index 25cd9e643..000000000
--- a/src/objective/rank_obj.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-/*!
- * Copyright 2019 XGBoost contributors
- */
-
-// Dummy file to keep the CUDA conditional compile trick.
-#include <dmlc/registry.h>
-namespace xgboost {
-namespace obj {
-
-DMLC_REGISTRY_FILE_TAG(rank_obj);
-
-}  // namespace obj
-}  // namespace xgboost
-
-#ifndef XGBOOST_USE_CUDA
-#include "rank_obj.cu"
-#endif  // XGBOOST_USE_CUDA
diff --git a/src/objective/rank_obj.cu b/src/objective/rank_obj.cu
deleted file mode 100644
index 23613d93d..000000000
--- a/src/objective/rank_obj.cu
+++ /dev/null
@@ -1,789 +0,0 @@
-/*!
- * Copyright 2015-2022 XGBoost contributors
- */
-#include <dmlc/omp.h>
-#include <dmlc/timer.h>
-#include <xgboost/logging.h>
-#include <xgboost/objective.h>
-#include <vector>
-#include <algorithm>
-#include <utility>
-
-#include "xgboost/json.h"
-#include "xgboost/parameter.h"
-
-#include "../common/math.h"
-#include "../common/random.h"
-
-#if defined(__CUDACC__)
-#include <thrust/sort.h>
-#include <thrust/gather.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/random/uniform_int_distribution.h>
-#include <thrust/random/linear_congruential_engine.h>
-
-#include <cub/util_allocator.cuh>
-
-#include "../common/device_helpers.cuh"
-#endif
-
-namespace xgboost {
-namespace obj {
-
-#if defined(XGBOOST_USE_CUDA) && !defined(GTEST_TEST)
-DMLC_REGISTRY_FILE_TAG(rank_obj_gpu);
-#endif  // defined(XGBOOST_USE_CUDA)
-
-struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
-  size_t num_pairsample;
-  float fix_list_weight;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(LambdaRankParam) {
-    DMLC_DECLARE_FIELD(num_pairsample).set_lower_bound(1).set_default(1)
-        .describe("Number of pair generated for each instance.");
-    DMLC_DECLARE_FIELD(fix_list_weight).set_lower_bound(0.0f).set_default(0.0f)
-        .describe("Normalize the weight of each list by this value,"
-                  " if equals 0, no effect will happen");
-  }
-};
-
-#if defined(__CUDACC__)
-// Helper functions
-
-template <typename T>
-XGBOOST_DEVICE __forceinline__ uint32_t
-CountNumItemsToTheLeftOf(const T *__restrict__ items, uint32_t n, T v) {
-  return thrust::lower_bound(thrust::seq, items, items + n, v,
-                             thrust::greater<T>()) -
-         items;
-}
-
-template <typename T>
-XGBOOST_DEVICE __forceinline__ uint32_t
-CountNumItemsToTheRightOf(const T *__restrict__ items, uint32_t n, T v) {
-  return n - (thrust::upper_bound(thrust::seq, items, items + n, v,
-                                  thrust::greater<T>()) -
-              items);
-}
-#endif
-
-/*! \brief helper information in a list */
-struct ListEntry {
-  /*! \brief the predict score we in the data */
-  bst_float pred;
-  /*! \brief the actual label of the entry */
-  bst_float label;
-  /*! \brief row index in the data matrix */
-  unsigned rindex;
-  // constructor
-  ListEntry(bst_float pred, bst_float label, unsigned rindex)
-    : pred(pred), label(label), rindex(rindex) {}
-  // comparator by prediction
-  inline static bool CmpPred(const ListEntry &a, const ListEntry &b) {
-    return a.pred > b.pred;
-  }
-  // comparator by label
-  inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) {
-    return a.label > b.label;
-  }
-};
-
-/*! \brief a pair in the lambda rank */
-struct LambdaPair {
-  /*! \brief positive index: this is a position in the list */
-  unsigned pos_index;
-  /*! \brief negative index: this is a position in the list */
-  unsigned neg_index;
-  /*! \brief weight to be filled in */
-  bst_float weight;
-  // constructor
-  LambdaPair(unsigned pos_index, unsigned neg_index)
-    : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {}
-  // constructor
-  LambdaPair(unsigned pos_index, unsigned neg_index, bst_float weight)
-    : pos_index(pos_index), neg_index(neg_index), weight(weight) {}
-};
-
-class PairwiseLambdaWeightComputer {
- public:
-  /*!
-   * \brief get lambda weight for existing pairs - for pairwise objective
-   * \param list a list that is sorted by pred score
-   * \param io_pairs record of pairs, containing the pairs to fill in weights
-   */
-  static void GetLambdaWeight(const std::vector<ListEntry>&,
-                              std::vector<LambdaPair>*) {}
-
-  static char const* Name() {
-    return "rank:pairwise";
-  }
-
-#if defined(__CUDACC__)
-  PairwiseLambdaWeightComputer(const bst_float*,
-                               const bst_float*,
-                               const dh::SegmentSorter<float>&) {}
-
-  class PairwiseLambdaWeightMultiplier {
-   public:
-    // Adjust the items weight by this value
-    __device__ __forceinline__ bst_float GetWeight(uint32_t gidx, int pidx, int nidx) const {
-      return 1.0f;
-    }
-  };
-
-  inline const PairwiseLambdaWeightMultiplier GetWeightMultiplier() const {
-    return {};
-  }
-#endif
-};
-
-#if defined(__CUDACC__)
-class BaseLambdaWeightMultiplier {
- public:
-  BaseLambdaWeightMultiplier(const dh::SegmentSorter<float> &segment_label_sorter,
-                             const dh::SegmentSorter<float> &segment_pred_sorter)
-    : dsorted_labels_(segment_label_sorter.GetItemsSpan()),
-      dorig_pos_(segment_label_sorter.GetOriginalPositionsSpan()),
-      dgroups_(segment_label_sorter.GetGroupsSpan()),
-      dindexable_sorted_preds_pos_(segment_pred_sorter.GetIndexableSortedPositionsSpan()) {}
-
- protected:
-  const common::Span<const float> dsorted_labels_;  // Labels sorted within a group
-  const common::Span<const uint32_t> dorig_pos_;  // Original indices of the labels
-                                                  // before they are sorted
-  const common::Span<const uint32_t> dgroups_;  // The group indices
-  // Where can a prediction for a label be found in the original array, when they are sorted
-  const common::Span<const uint32_t> dindexable_sorted_preds_pos_;
-};
-
-// While computing the weight that needs to be adjusted by this ranking objective, we need
-// to figure out where positive and negative labels chosen earlier exists, if the group
-// were to be sorted by its predictions. To accommodate this, we employ the following algorithm.
-// For a given group, let's assume the following:
-// labels:        1 5 9 2 4 8 0 7 6 3
-// predictions:   1 9 0 8 2 7 3 6 5 4
-// position:      0 1 2 3 4 5 6 7 8 9
-//
-// After label sort:
-// labels:        9 8 7 6 5 4 3 2 1 0
-// position:      2 5 7 8 1 4 9 3 0 6
-//
-// After prediction sort:
-// predictions:   9 8 7 6 5 4 3 2 1 0
-// position:      1 3 5 7 8 9 6 4 0 2
-//
-// If a sorted label at position 'x' is chosen, then we need to find out where the prediction
-// for this label 'x' exists, if the group were to be sorted by predictions.
-// We first take the sorted prediction positions:
-// position:      1 3 5 7 8 9 6 4 0 2
-// at indices:    0 1 2 3 4 5 6 7 8 9
-//
-// We create a sorted prediction positional array, such that value at position 'x' gives
-// us the position in the sorted prediction array where its related prediction lies.
-// dindexable_sorted_preds_pos_:  8 0 9 1 7 2 6 3 4 5
-// at indices:                    0 1 2 3 4 5 6 7 8 9
-// Basically, swap the previous 2 arrays, sort the indices and reorder positions
-// for an O(1) lookup using the position where the sorted label exists.
-//
-// This type does that using the SegmentSorter
-class IndexablePredictionSorter {
- public:
-  IndexablePredictionSorter(const bst_float *dpreds,
-                            const dh::SegmentSorter<float> &segment_label_sorter) {
-    // Sort the predictions first
-    segment_pred_sorter_.SortItems(dpreds, segment_label_sorter.GetNumItems(),
-                                   segment_label_sorter.GetGroupSegmentsSpan());
-
-    // Create an index for the sorted prediction positions
-    segment_pred_sorter_.CreateIndexableSortedPositions();
-  }
-
-  inline const dh::SegmentSorter<float> &GetPredictionSorter() const {
-    return segment_pred_sorter_;
-  }
-
- private:
-  dh::SegmentSorter<float> segment_pred_sorter_;  // For sorting the predictions
-};
-#endif
-
-class MAPLambdaWeightComputer
-#if defined(__CUDACC__)
-  : public IndexablePredictionSorter
-#endif
-{
- public:
-  struct MAPStats {
-    /*! \brief the accumulated precision */
-    float ap_acc{0.0f};
-    /*!
-     * \brief the accumulated precision,
-     *   assuming a positive instance is missing
-     */
-    float ap_acc_miss{0.0f};
-    /*!
-     * \brief the accumulated precision,
-     * assuming that one more positive instance is inserted ahead
-     */
-    float ap_acc_add{0.0f};
-    /* \brief the accumulated positive instance count */
-    float hits{0.0f};
-
-    XGBOOST_DEVICE MAPStats() {}  // NOLINT
-    XGBOOST_DEVICE MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits)
-      : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {}
-
-    // For prefix scan
-    XGBOOST_DEVICE MAPStats operator +(const MAPStats &v1) const {
-      return {ap_acc + v1.ap_acc, ap_acc_miss + v1.ap_acc_miss,
-              ap_acc_add + v1.ap_acc_add, hits + v1.hits};
-    }
-
-    // For test purposes - compare for equality
-    XGBOOST_DEVICE bool operator ==(const MAPStats &rhs) const {
-      return ap_acc == rhs.ap_acc && ap_acc_miss == rhs.ap_acc_miss &&
-             ap_acc_add == rhs.ap_acc_add && hits == rhs.hits;
-    }
-  };
-
- private:
-  template <typename T>
-  XGBOOST_DEVICE inline static void Swap(T &v0, T &v1) {
-#if defined(__CUDACC__)
-    thrust::swap(v0, v1);
-#else
-    std::swap(v0, v1);
-#endif
-  }
-
-  /*!
-   * \brief Obtain the delta MAP by trying to switch the positions of labels in pos_pred_pos or
-   *        neg_pred_pos when sorted by predictions
-   * \param pos_pred_pos positive label's prediction value position when the groups prediction
-   *        values are sorted
-   * \param neg_pred_pos negative label's prediction value position when the groups prediction
-   *        values are sorted
-   * \param pos_label, neg_label the chosen positive and negative labels
-   * \param p_map_stats a vector containing the accumulated precisions for each position in a list
-   * \param map_stats_size size of the accumulated precisions vector
-   */
-  XGBOOST_DEVICE inline static bst_float GetLambdaMAP(
-    int pos_pred_pos, int neg_pred_pos,
-    bst_float pos_label, bst_float neg_label,
-    const MAPStats *p_map_stats, uint32_t map_stats_size) {
-    if (pos_pred_pos == neg_pred_pos || p_map_stats[map_stats_size - 1].hits == 0) {
-      return 0.0f;
-    }
-    if (pos_pred_pos > neg_pred_pos) {
-      Swap(pos_pred_pos, neg_pred_pos);
-      Swap(pos_label, neg_label);
-    }
-    bst_float original = p_map_stats[neg_pred_pos].ap_acc;
-    if (pos_pred_pos != 0) original -= p_map_stats[pos_pred_pos - 1].ap_acc;
-    bst_float changed = 0;
-    bst_float label1 = pos_label > 0.0f ? 1.0f : 0.0f;
-    bst_float label2 = neg_label > 0.0f ? 1.0f : 0.0f;
-    if (label1 == label2) {
-      return 0.0;
-    } else if (label1 < label2) {
-      changed += p_map_stats[neg_pred_pos - 1].ap_acc_add - p_map_stats[pos_pred_pos].ap_acc_add;
-      changed += (p_map_stats[pos_pred_pos].hits + 1.0f) / (pos_pred_pos + 1);
-    } else {
-      changed += p_map_stats[neg_pred_pos - 1].ap_acc_miss - p_map_stats[pos_pred_pos].ap_acc_miss;
-      changed += p_map_stats[neg_pred_pos].hits / (neg_pred_pos + 1);
-    }
-    bst_float ans = (changed - original) / (p_map_stats[map_stats_size - 1].hits);
-    if (ans < 0) ans = -ans;
-    return ans;
-  }
-
- public:
-  /*
-   * \brief obtain preprocessing results for calculating delta MAP
-   * \param sorted_list the list containing entry information
-   * \param map_stats a vector containing the accumulated precisions for each position in a list
-   */
-  inline static void GetMAPStats(const std::vector<ListEntry> &sorted_list,
-                                 std::vector<MAPStats> *p_map_acc) {
-    std::vector<MAPStats> &map_acc = *p_map_acc;
-    map_acc.resize(sorted_list.size());
-    bst_float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
-    for (size_t i = 1; i <= sorted_list.size(); ++i) {
-      if (sorted_list[i - 1].label > 0.0f) {
-        hit++;
-        acc1 += hit / i;
-        acc2 += (hit - 1) / i;
-        acc3 += (hit + 1) / i;
-      }
-      map_acc[i - 1] = MAPStats(acc1, acc2, acc3, hit);
-    }
-  }
-
-  static char const* Name() {
-    return "rank:map";
-  }
-
-  static void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                              std::vector<LambdaPair> *io_pairs) {
-    std::vector<LambdaPair> &pairs = *io_pairs;
-    std::vector<MAPStats> map_stats;
-    GetMAPStats(sorted_list, &map_stats);
-    for (auto & pair : pairs) {
-      pair.weight *=
-        GetLambdaMAP(pair.pos_index, pair.neg_index,
-                     sorted_list[pair.pos_index].label, sorted_list[pair.neg_index].label,
-                     &map_stats[0], map_stats.size());
-    }
-  }
-
-#if defined(__CUDACC__)
-  MAPLambdaWeightComputer(const bst_float *dpreds,
-                          const bst_float *dlabels,
-                          const dh::SegmentSorter<float> &segment_label_sorter)
-    : IndexablePredictionSorter(dpreds, segment_label_sorter),
-      dmap_stats_(segment_label_sorter.GetNumItems(), MAPStats()),
-      weight_multiplier_(segment_label_sorter, *this) {
-    this->CreateMAPStats(dlabels, segment_label_sorter);
-  }
-
-  void CreateMAPStats(const bst_float *dlabels,
-                      const dh::SegmentSorter<float> &segment_label_sorter) {
-    // For each group, go through the sorted prediction positions, and look up its corresponding
-    // label from the unsorted labels (from the original label list)
-
-    // For each item in the group, compute its MAP stats.
-    // Interleave the computation of map stats amongst different groups.
-
-    // First, determine postive labels in the dataset individually
-    auto nitems = segment_label_sorter.GetNumItems();
-    dh::caching_device_vector<uint32_t> dhits(nitems, 0);
-    // Original positions of the predictions after they have been sorted
-    const auto &pred_original_pos = this->GetPredictionSorter().GetOriginalPositionsSpan();
-    // Unsorted labels
-    const float *unsorted_labels = dlabels;
-    auto DeterminePositiveLabelLambda = [=] __device__(uint32_t idx) {
-      return (unsorted_labels[pred_original_pos[idx]] > 0.0f) ? 1 : 0;
-    };  // NOLINT
-
-    thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
-                      thrust::make_counting_iterator(nitems),
-                      dhits.begin(),
-                      DeterminePositiveLabelLambda);
-
-    // Allocator to be used by sort for managing space overhead while performing prefix scans
-    dh::XGBCachingDeviceAllocator<char> alloc;
-
-    // Next, prefix scan the positive labels that are segmented to accumulate them.
-    // This is required for computing the accumulated precisions
-    const auto &group_segments = segment_label_sorter.GetGroupSegmentsSpan();
-    // Data segmented into different groups...
-    thrust::inclusive_scan_by_key(thrust::cuda::par(alloc),
-                                  dh::tcbegin(group_segments), dh::tcend(group_segments),
-                                  dhits.begin(),  // Input value
-                                  dhits.begin());  // In-place scan
-
-    // Compute accumulated precisions for each item, assuming positive and
-    // negative instances are missing.
-    // But first, compute individual item precisions
-    const auto *dhits_arr = dhits.data().get();
-    // Group info on device
-    const auto &dgroups = segment_label_sorter.GetGroupsSpan();
-    auto ComputeItemPrecisionLambda = [=] __device__(uint32_t idx) {
-      if (unsorted_labels[pred_original_pos[idx]] > 0.0f) {
-        auto idx_within_group = (idx - dgroups[group_segments[idx]]) + 1;
-        return MAPStats{static_cast<float>(dhits_arr[idx]) / idx_within_group,
-                        static_cast<float>(dhits_arr[idx] - 1) / idx_within_group,
-                        static_cast<float>(dhits_arr[idx] + 1) / idx_within_group,
-                        1.0f};
-      }
-      return MAPStats{};
-    };  // NOLINT
-
-    thrust::transform(thrust::make_counting_iterator(static_cast<uint32_t>(0)),
-                      thrust::make_counting_iterator(nitems),
-                      this->dmap_stats_.begin(),
-                      ComputeItemPrecisionLambda);
-
-    // Lastly, compute the accumulated precisions for all the items segmented by groups.
-    // The precisions are accumulated within each group
-    thrust::inclusive_scan_by_key(thrust::cuda::par(alloc),
-                                  dh::tcbegin(group_segments), dh::tcend(group_segments),
-                                  this->dmap_stats_.begin(),  // Input map stats
-                                  this->dmap_stats_.begin());  // In-place scan and output here
-  }
-
-  inline const common::Span<const MAPStats> GetMapStatsSpan() const {
-    return { dmap_stats_.data().get(), dmap_stats_.size() };
-  }
-
-  // Type containing device pointers that can be cheaply copied on the kernel
-  class MAPLambdaWeightMultiplier : public BaseLambdaWeightMultiplier {
-   public:
-    MAPLambdaWeightMultiplier(const dh::SegmentSorter<float> &segment_label_sorter,
-                              const MAPLambdaWeightComputer &lwc)
-      : BaseLambdaWeightMultiplier(segment_label_sorter, lwc.GetPredictionSorter()),
-        dmap_stats_(lwc.GetMapStatsSpan()) {}
-
-    // Adjust the items weight by this value
-    __device__ __forceinline__ bst_float GetWeight(uint32_t gidx, int pidx, int nidx) const {
-      uint32_t group_begin = dgroups_[gidx];
-      uint32_t group_end = dgroups_[gidx + 1];
-
-      auto pos_lab_orig_posn = dorig_pos_[pidx];
-      auto neg_lab_orig_posn = dorig_pos_[nidx];
-      KERNEL_CHECK(pos_lab_orig_posn != neg_lab_orig_posn);
-
-      // Note: the label positive and negative indices are relative to the entire dataset.
-      // Hence, scale them back to an index within the group
-      auto pos_pred_pos = dindexable_sorted_preds_pos_[pos_lab_orig_posn] - group_begin;
-      auto neg_pred_pos = dindexable_sorted_preds_pos_[neg_lab_orig_posn] - group_begin;
-      return MAPLambdaWeightComputer::GetLambdaMAP(
-        pos_pred_pos, neg_pred_pos,
-        dsorted_labels_[pidx], dsorted_labels_[nidx],
-        &dmap_stats_[group_begin], group_end - group_begin);
-    }
-
-   private:
-    common::Span<const MAPStats> dmap_stats_;  // Start address of the map stats for every sorted
-                                               // prediction value
-  };
-
-  inline const MAPLambdaWeightMultiplier GetWeightMultiplier() const { return weight_multiplier_; }
-
- private:
-  dh::caching_device_vector<MAPStats> dmap_stats_;
-  // This computes the adjustment to the weight
-  const MAPLambdaWeightMultiplier weight_multiplier_;
-#endif
-};
-
-#if defined(__CUDACC__)
-class SortedLabelList : dh::SegmentSorter<float> {
- private:
-  const LambdaRankParam &param_;                      // Objective configuration
-
- public:
-  explicit SortedLabelList(const LambdaRankParam &param)
-    : param_(param) {}
-
-  // Sort the labels that are grouped by 'groups'
-  void Sort(const HostDeviceVector<bst_float> &dlabels, const std::vector<uint32_t> &groups) {
-    this->SortItems(dlabels.ConstDevicePointer(), dlabels.Size(), groups);
-  }
-
-  // This kernel can only run *after* the kernel in sort is completed, as they
-  // use the default stream
-  template <typename LambdaWeightComputerT>
-  void ComputeGradients(const bst_float *dpreds,   // Unsorted predictions
-                        const bst_float *dlabels,  // Unsorted labels
-                        const HostDeviceVector<bst_float> &weights,
-                        int iter,
-                        GradientPair *out_gpair,
-                        float weight_normalization_factor) {
-    // Group info on device
-    const auto &dgroups = this->GetGroupsSpan();
-    uint32_t ngroups = this->GetNumGroups() + 1;
-
-    uint32_t total_items = this->GetNumItems();
-    uint32_t niter = param_.num_pairsample * total_items;
-
-    float fix_list_weight = param_.fix_list_weight;
-
-    const auto &original_pos = this->GetOriginalPositionsSpan();
-
-    uint32_t num_weights = weights.Size();
-    auto dweights = num_weights ? weights.ConstDevicePointer() : nullptr;
-
-    const auto &sorted_labels = this->GetItemsSpan();
-
-    // This is used to adjust the weight of different elements based on the different ranking
-    // objective function policies
-    LambdaWeightComputerT weight_computer(dpreds, dlabels, *this);
-    auto wmultiplier = weight_computer.GetWeightMultiplier();
-
-    int device_id = -1;
-    dh::safe_cuda(cudaGetDevice(&device_id));
-    // For each instance in the group, compute the gradient pair concurrently
-    dh::LaunchN(niter, nullptr, [=] __device__(uint32_t idx) {
-      // First, determine the group 'idx' belongs to
-      uint32_t item_idx = idx % total_items;
-      uint32_t group_idx =
-          thrust::upper_bound(thrust::seq, dgroups.begin(),
-                              dgroups.begin() + ngroups, item_idx) -
-          dgroups.begin();
-      // Span of this group within the larger labels/predictions sorted tuple
-      uint32_t group_begin = dgroups[group_idx - 1];
-      uint32_t group_end = dgroups[group_idx];
-      uint32_t total_group_items = group_end - group_begin;
-
-      // Are the labels diverse enough? If they are all the same, then there is nothing to pick
-      // from another group - bail sooner
-      if (sorted_labels[group_begin] == sorted_labels[group_end - 1]) return;
-
-      // Find the number of labels less than and greater than the current label
-      // at the sorted index position item_idx
-      uint32_t nleft  = CountNumItemsToTheLeftOf(
-        sorted_labels.data() + group_begin, item_idx - group_begin + 1, sorted_labels[item_idx]);
-      uint32_t nright = CountNumItemsToTheRightOf(
-        sorted_labels.data() + item_idx, group_end - item_idx, sorted_labels[item_idx]);
-
-      // Create a minstd_rand object to act as our source of randomness
-      thrust::minstd_rand rng((iter + 1) * 1111);
-      rng.discard(((idx / total_items) * total_group_items) + item_idx - group_begin);
-      // Create a uniform_int_distribution to produce a sample from outside of the
-      // present label group
-      thrust::uniform_int_distribution<int> dist(0, nleft + nright - 1);
-
-      int sample = dist(rng);
-      int pos_idx = -1;  // Bigger label
-      int neg_idx = -1;  // Smaller label
-      // Are we picking a sample to the left/right of the current group?
-      if (sample < nleft) {
-        // Go left
-        pos_idx = sample + group_begin;
-        neg_idx = item_idx;
-      } else {
-        pos_idx = item_idx;
-        uint32_t items_in_group = total_group_items - nleft - nright;
-        neg_idx = sample + items_in_group + group_begin;
-      }
-
-      // Compute and assign the gradients now
-      const float eps = 1e-16f;
-      bst_float p = common::Sigmoid(dpreds[original_pos[pos_idx]] - dpreds[original_pos[neg_idx]]);
-      bst_float g = p - 1.0f;
-      bst_float h = thrust::max(p * (1.0f - p), eps);
-
-      // Rescale each gradient and hessian so that the group has a weighted constant
-      float scale = __frcp_ru(niter / total_items);
-      if (fix_list_weight != 0.0f) {
-        scale *= fix_list_weight / total_group_items;
-      }
-
-      float weight = num_weights ? dweights[group_idx - 1] : 1.0f;
-      weight *= weight_normalization_factor;
-      weight *= wmultiplier.GetWeight(group_idx - 1, pos_idx, neg_idx);
-      weight *= scale;
-      // Accumulate gradient and hessian in both positive and negative indices
-      const GradientPair in_pos_gpair(g * weight, 2.0f * weight * h);
-      dh::AtomicAddGpair(&out_gpair[original_pos[pos_idx]], in_pos_gpair);
-
-      const GradientPair in_neg_gpair(-g * weight, 2.0f * weight * h);
-      dh::AtomicAddGpair(&out_gpair[original_pos[neg_idx]], in_neg_gpair);
-    });
-
-    // Wait until the computations done by the kernel is complete
-    dh::safe_cuda(cudaStreamSynchronize(nullptr));
-  }
-};
-#endif
-
-// objective for lambda rank
-template <typename LambdaWeightComputerT>
-class LambdaRankObj : public ObjFunction {
- public:
-  void Configure(Args const &args) override { param_.UpdateAllowUnknown(args); }
-  ObjInfo Task() const override { return ObjInfo::kRanking; }
-
-  void GetGradient(const HostDeviceVector<bst_float>& preds,
-                   const MetaInfo& info,
-                   int iter,
-                   HostDeviceVector<GradientPair>* out_gpair) override {
-    CHECK_EQ(preds.Size(), info.labels.Size()) << "label size predict size not match";
-
-    // quick consistency when group is not available
-    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.Size());
-    const std::vector<unsigned> &gptr = info.group_ptr_.size() == 0 ? tgptr : info.group_ptr_;
-    CHECK(gptr.size() != 0 && gptr.back() == info.labels.Size())
-          << "group structure not consistent with #rows" << ", "
-          << "group ponter size: " << gptr.size() << ", "
-          << "labels size: " << info.labels.Size() << ", "
-          << "group pointer back: " << (gptr.size() == 0 ? 0 : gptr.back());
-
-#if defined(__CUDACC__)
-    // Check if we have a GPU assignment; else, revert back to CPU
-    auto device = ctx_->gpu_id;
-    if (device >= 0) {
-      ComputeGradientsOnGPU(preds, info, iter, out_gpair, gptr);
-    } else {
-      // Revert back to CPU
-#endif
-      ComputeGradientsOnCPU(preds, info, iter, out_gpair, gptr);
-#if defined(__CUDACC__)
-    }
-#endif
-  }
-
-  const char* DefaultEvalMetric() const override {
-    return "map";
-  }
-
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["name"] = String(LambdaWeightComputerT::Name());
-    out["lambda_rank_param"] = ToJson(param_);
-  }
-
-  void LoadConfig(Json const& in) override {
-    FromJson(in["lambda_rank_param"], &param_);
-  }
-
- private:
-  bst_float ComputeWeightNormalizationFactor(const MetaInfo& info,
-                                             const std::vector<unsigned> &gptr) {
-    const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    bst_float sum_weights = 0;
-    for (bst_omp_uint k = 0; k < ngroup; ++k) {
-      sum_weights += info.GetWeight(k);
-    }
-    return ngroup / sum_weights;
-  }
-
-  void ComputeGradientsOnCPU(const HostDeviceVector<bst_float>& preds,
-                             const MetaInfo& info,
-                             int iter,
-                             HostDeviceVector<GradientPair>* out_gpair,
-                             const std::vector<unsigned> &gptr) {
-    LOG(DEBUG) << "Computing " << LambdaWeightComputerT::Name() << " gradients on CPU.";
-
-    bst_float weight_normalization_factor = ComputeWeightNormalizationFactor(info, gptr);
-
-    const auto& preds_h = preds.HostVector();
-    const auto& labels = info.labels.HostView();
-    std::vector<GradientPair>& gpair = out_gpair->HostVector();
-    const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    out_gpair->Resize(preds.Size());
-
-    dmlc::OMPException exc;
-#pragma omp parallel num_threads(ctx_->Threads())
-    {
-      exc.Run([&]() {
-        // parallel construct, declare random number generator here, so that each
-        // thread use its own random number generator, seed by thread id and current iteration
-        std::minstd_rand rnd((iter + 1) * 1111);
-        std::vector<LambdaPair> pairs;
-        std::vector<ListEntry>  lst;
-        std::vector< std::pair<bst_float, unsigned> > rec;
-
-        #pragma omp for schedule(static)
-        for (bst_omp_uint k = 0; k < ngroup; ++k) {
-          exc.Run([&]() {
-            lst.clear(); pairs.clear();
-            for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
-              lst.emplace_back(preds_h[j], labels(j), j);
-              gpair[j] = GradientPair(0.0f, 0.0f);
-            }
-            std::stable_sort(lst.begin(), lst.end(), ListEntry::CmpPred);
-            rec.resize(lst.size());
-            for (unsigned i = 0; i < lst.size(); ++i) {
-              rec[i] = std::make_pair(lst[i].label, i);
-            }
-            std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
-            // enumerate buckets with same label
-            // for each item in the lst, grab another sample randomly
-            for (unsigned i = 0; i < rec.size(); ) {
-              unsigned j = i + 1;
-              while (j < rec.size() && rec[j].first == rec[i].first) ++j;
-              // bucket in [i,j), get a sample outside bucket
-              unsigned nleft = i, nright = static_cast<unsigned>(rec.size() - j);
-              if (nleft + nright != 0) {
-                int nsample = param_.num_pairsample;
-                while (nsample --) {
-                  for (unsigned pid = i; pid < j; ++pid) {
-                    unsigned ridx =
-                        std::uniform_int_distribution<unsigned>(0, nleft + nright - 1)(rnd);
-                    if (ridx < nleft) {
-                      pairs.emplace_back(rec[ridx].second, rec[pid].second,
-                          info.GetWeight(k) * weight_normalization_factor);
-                    } else {
-                      pairs.emplace_back(rec[pid].second, rec[ridx+j-i].second,
-                          info.GetWeight(k) * weight_normalization_factor);
-                    }
-                  }
-                }
-              }
-              i = j;
-            }
-            // get lambda weight for the pairs
-            LambdaWeightComputerT::GetLambdaWeight(lst, &pairs);
-            // rescale each gradient and hessian so that the lst have constant weighted
-            float scale = 1.0f / param_.num_pairsample;
-            if (param_.fix_list_weight != 0.0f) {
-              scale *= param_.fix_list_weight / (gptr[k + 1] - gptr[k]);
-            }
-            for (auto & pair : pairs) {
-              const ListEntry &pos = lst[pair.pos_index];
-              const ListEntry &neg = lst[pair.neg_index];
-              const bst_float w = pair.weight * scale;
-              const float eps = 1e-16f;
-              bst_float p = common::Sigmoid(pos.pred - neg.pred);
-              bst_float g = p - 1.0f;
-              bst_float h = std::max(p * (1.0f - p), eps);
-              // accumulate gradient and hessian in both pid, and nid
-              gpair[pos.rindex] += GradientPair(g * w, 2.0f*w*h);
-              gpair[neg.rindex] += GradientPair(-g * w, 2.0f*w*h);
-            }
-          });
-        }
-      });
-    }
-    exc.Rethrow();
-  }
-
-#if defined(__CUDACC__)
-  void ComputeGradientsOnGPU(const HostDeviceVector<bst_float>& preds,
-                             const MetaInfo& info,
-                             int iter,
-                             HostDeviceVector<GradientPair>* out_gpair,
-                             const std::vector<unsigned> &gptr) {
-    LOG(DEBUG) << "Computing " << LambdaWeightComputerT::Name() << " gradients on GPU.";
-
-    auto device = ctx_->gpu_id;
-    dh::safe_cuda(cudaSetDevice(device));
-
-    bst_float weight_normalization_factor = ComputeWeightNormalizationFactor(info, gptr);
-
-    // Set the device ID and copy them to the device
-    out_gpair->SetDevice(device);
-    info.labels.SetDevice(device);
-    preds.SetDevice(device);
-    info.weights_.SetDevice(device);
-
-    out_gpair->Resize(preds.Size());
-
-    auto d_preds = preds.ConstDevicePointer();
-    auto d_gpair = out_gpair->DevicePointer();
-    auto d_labels = info.labels.View(device);
-
-    SortedLabelList slist(param_);
-
-    // Sort the labels within the groups on the device
-    slist.Sort(*info.labels.Data(), gptr);
-
-    // Initialize the gradients next
-    out_gpair->Fill(GradientPair(0.0f, 0.0f));
-
-    // Finally, compute the gradients
-    slist.ComputeGradients<LambdaWeightComputerT>(d_preds, d_labels.Values().data(), info.weights_,
-                                                  iter, d_gpair, weight_normalization_factor);
-  }
-#endif
-
-  LambdaRankParam param_;
-};
-
-#if !defined(GTEST_TEST)
-// register the objective functions
-DMLC_REGISTER_PARAMETER(LambdaRankParam);
-
-XGBOOST_REGISTER_OBJECTIVE(PairwiseRankObj, PairwiseLambdaWeightComputer::Name())
-.describe("Pairwise rank objective.")
-.set_body([]() { return new LambdaRankObj<PairwiseLambdaWeightComputer>(); });
-
-XGBOOST_REGISTER_OBJECTIVE(LambdaRankObjMAP, MAPLambdaWeightComputer::Name())
-.describe("LambdaRank with MAP as objective.")
-.set_body([]() { return new LambdaRankObj<MAPLambdaWeightComputer>(); });
-#endif
-
-}  // namespace obj
-}  // namespace xgboost
diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc
index d02a55c1b..c808e97f0 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cc
+++ b/tests/cpp/objective/test_lambdarank_obj.cc
@@ -223,4 +223,125 @@ TEST(LambdaRank, MakePair) {
     ASSERT_EQ(n_pairs, info.num_row_ * param.NumPair());
   }
 }
+
+void TestMAPStat(Context const* ctx) {  // shared by the CPU and GPU MAPStat tests
+  auto p_fmat = EmptyDMatrix();
+  MetaInfo& info = p_fmat->Info();
+  ltr::LambdaRankParam param;
+  param.UpdateAllowUnknown(Args{});  // no parameter overrides are passed in
+
+  {  // case 1: six labels; group_ptr_ is not assigned in this scope
+    std::vector<float> h_data{1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 1.0f};
+    info.labels.Reshape(h_data.size(), 1);
+    info.labels.Data()->HostVector() = h_data;
+    info.num_row_ = h_data.size();
+
+    HostDeviceVector<float> predt;
+    auto& h_predt = predt.HostVector();
+    h_predt.resize(h_data.size());
+    std::iota(h_predt.rbegin(), h_predt.rend(), 0.0f);  // h_predt = {5, 4, 3, 2, 1, 0}
+
+    auto p_cache = std::make_shared<ltr::MAPCache>(ctx, info, param);
+
+    predt.SetDevice(ctx->gpu_id);
+    auto rank_idx =  // host span for a CPU ctx, device span otherwise
+        p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
+
+    if (ctx->IsCPU()) {  // dispatch to the implementation matching ctx
+      obj::cpu_impl::MAPStat(ctx, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
+                             p_cache);
+    } else {
+      obj::cuda_impl::MAPStat(ctx, info, rank_idx, p_cache);
+    }
+
+    Context cpu_ctx;  // results are read back through a separate default-constructed Context
+    auto n_rel = p_cache->NumRelevant(&cpu_ctx);
+    auto acc = p_cache->Acc(&cpu_ctx);
+
+    ASSERT_EQ(n_rel[0], 1.0);
+    ASSERT_EQ(acc[0], 1.0);
+
+    ASSERT_EQ(n_rel.back(), h_data.size() - 1.0);  // five of the six labels are 1
+    ASSERT_NEAR(acc.back(), 1.95 + (1.0 / h_data.size()), kRtEps);
+  }
+  {  // case 2: two groups of 8 rows each (group_ptr_ = {0, 8, 16})
+    info.labels.Reshape(16);
+    auto& h_label = info.labels.Data()->HostVector();
+    info.group_ptr_ = {0, 8, 16};
+    info.num_row_ = info.labels.Shape(0);
+
+    std::fill_n(h_label.begin(), 8, 1.0f);  // rows 0-7 get label 1
+    std::fill_n(h_label.begin() + 8, 8, 0.0f);  // rows 8-15 get label 0
+    HostDeviceVector<float> predt;
+    auto& h_predt = predt.HostVector();
+    h_predt.resize(h_label.size());
+    std::iota(h_predt.rbegin(), h_predt.rbegin() + 8, 0.0f);  // last 8 entries, filled back to front
+    std::iota(h_predt.rbegin() + 8, h_predt.rend(), 0.0f);  // remaining entries, same pattern
+
+    auto p_cache = std::make_shared<ltr::MAPCache>(ctx, info, param);
+
+    predt.SetDevice(ctx->gpu_id);
+    auto rank_idx =  // host span for a CPU ctx, device span otherwise
+        p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
+
+    if (ctx->IsCPU()) {  // dispatch to the implementation matching ctx
+      obj::cpu_impl::MAPStat(ctx, info.labels.HostView().Slice(linalg::All(), 0), rank_idx,
+                             p_cache);
+    } else {
+      obj::cuda_impl::MAPStat(ctx, info, rank_idx, p_cache);
+    }
+
+    Context cpu_ctx;  // results are read back through a separate default-constructed Context
+    auto n_rel = p_cache->NumRelevant(&cpu_ctx);
+    ASSERT_EQ(n_rel[7], 8);      // first group
+    ASSERT_EQ(n_rel.back(), 0);  // second group
+  }
+}
+
+TEST(LambdaRank, MAPStat) {
+  Context ctx;  // default-constructed; gpu_id is left untouched
+  TestMAPStat(&ctx);
+}
+
+void TestMAPGPair(Context const* ctx) {  // shared by the CPU and GPU MAPGPair tests
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:map", ctx)};
+  Args args;  // left empty
+  obj->Configure(args);
+
+  CheckConfigReload(obj, "rank:map");
+
+  CheckRankingObjFunction(obj,                                                 // obj
+                          {0, 0.1f, 0, 0.1f},                                  // score
+                          {0, 1, 0, 1},                                        // label
+                          {2.0f, 2.0f},                                        // weight
+                          {0, 2, 4},                                           // group
+                          {1.2054923f, -1.2054923f, 1.2054923f, -1.2054923f},  // out grad
+                          {1.2657166f, 1.2657166f, 1.2657166f, 1.2657166f});
+  // disable the second query group with 0 weight
+  CheckRankingObjFunction(obj,                                  // obj
+                          {0, 0.1f, 0, 0.1f},                   // score
+                          {0, 1, 0, 1},                         // label
+                          {2.0f, 0.0f},                         // weight
+                          {0, 2, 4},                            // group
+                          {1.2054923f, -1.2054923f, .0f, .0f},  // out grad
+                          {1.2657166f, 1.2657166f, .0f, .0f});
+}
+
+TEST(LambdaRank, MAPGPair) {
+  Context ctx;  // default-constructed; gpu_id is left untouched
+  TestMAPGPair(&ctx);
+}
+
+void TestPairWiseGPair(Context const* ctx) {  // only constructs and configures the objective
+  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:pairwise", ctx)};
+  Args args;
+  obj->Configure(args);  // args is still empty here
+
+  args.emplace_back("lambdarank_unbiased", "true");  // NOTE(review): not read after this line
+}
+
+TEST(LambdaRank, Pairwise) {
+  Context ctx;  // default-constructed; gpu_id is left untouched
+  TestPairWiseGPair(&ctx);
+}
 }  // namespace xgboost::obj
diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu
index 01d020dda..d0f448993 100644
--- a/tests/cpp/objective/test_lambdarank_obj.cu
+++ b/tests/cpp/objective/test_lambdarank_obj.cu
@@ -18,6 +18,12 @@ TEST(LambdaRank, GPUNDCGJsonIO) {
   TestNDCGJsonIO(&ctx);
 }
 
+TEST(LambdaRank, GPUMAPStat) {
+  Context ctx;
+  ctx.gpu_id = 0;  // run the shared test body with gpu_id 0
+  TestMAPStat(&ctx);
+}
+
 TEST(LambdaRank, GPUNDCGGPair) {
   Context ctx;
   ctx.gpu_id = 0;
@@ -153,4 +159,10 @@ TEST(LambdaRank, RankItemCountOnRight) {
   RankItemCountImpl(sorted_items, wrapper, 1, static_cast<uint32_t>(1));
   RankItemCountImpl(sorted_items, wrapper, 0, static_cast<uint32_t>(0));
 }
+
+TEST(LambdaRank, GPUMAPGPair) {
+  Context ctx;
+  ctx.gpu_id = 0;  // run the shared test body with gpu_id 0
+  TestMAPGPair(&ctx);
+}
 }  // namespace xgboost::obj
diff --git a/tests/cpp/objective/test_lambdarank_obj.h b/tests/cpp/objective/test_lambdarank_obj.h
index aebe3ad54..9539f1a30 100644
--- a/tests/cpp/objective/test_lambdarank_obj.h
+++ b/tests/cpp/objective/test_lambdarank_obj.h
@@ -18,6 +18,8 @@
 #include "../helpers.h"                             // for EmptyDMatrix
 
 namespace xgboost::obj {
+void TestMAPStat(Context const* ctx);
+
 inline void TestNDCGJsonIO(Context const* ctx) {
   std::unique_ptr<xgboost::ObjFunction> obj{ObjFunction::Create("rank:ndcg", ctx)};
 
@@ -37,6 +39,8 @@ void TestNDCGGPair(Context const* ctx);
 
 void TestUnbiasedNDCG(Context const* ctx);
 
+void TestMAPGPair(Context const* ctx);
+
 /**
  * \brief Initialize test data for make pair tests.
  */
diff --git a/tests/cpp/objective/test_ranking_obj.cc b/tests/cpp/objective/test_ranking_obj.cc
deleted file mode 100644
index 2072f530e..000000000
--- a/tests/cpp/objective/test_ranking_obj.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright by Contributors
-#include <xgboost/context.h>
-#include <xgboost/json.h>
-#include <xgboost/objective.h>
-
-#include "../helpers.h"
-
-namespace xgboost {
-
-TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPair)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:pairwise", &ctx)};
-  obj->Configure(args);
-  CheckConfigReload(obj, "rank:pairwise");
-
-  // Test with setting sample weight to second query group
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {1.9f, -1.9f, 0.0f, 0.0f},
-                          {1.995f, 1.995f, 0.0f, 0.0f});
-
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {1.0f, 1.0f},
-                          {0, 2, 4},
-                          {0.95f, -0.95f,  0.95f, -0.95f},
-                          {0.9975f, 0.9975f, 0.9975f, 0.9975f});
-
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-TEST(Objective, DeclareUnifiedTest(PairwiseRankingGPairSameLabels)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("rank:pairwise", &ctx)};
-  obj->Configure(args);
-  // No computation of gradient/hessian, as there is no diversity in labels
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {1,   1, 1, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {0.0f, 0.0f, 0.0f, 0.0f},
-                          {0.0f, 0.0f, 0.0f, 0.0f});
-
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-TEST(Objective, DeclareUnifiedTest(MAPRankingGPair)) {
-  std::vector<std::pair<std::string, std::string>> args;
-  xgboost::Context ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create("rank:map", &ctx)};
-  obj->Configure(args);
-  CheckConfigReload(obj, "rank:map");
-
-  // Test with setting sample weight to second query group
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {2.0f, 0.0f},
-                          {0, 2, 4},
-                          {0.95f, -0.95f,  0.0f, 0.0f},
-                          {0.9975f, 0.9975f, 0.0f, 0.0f});
-
-  CheckRankingObjFunction(obj,
-                          {0, 0.1f, 0, 0.1f},
-                          {0,   1, 0, 1},
-                          {1.0f, 1.0f},
-                          {0, 2, 4},
-                          {0.475f, -0.475f,  0.475f, -0.475f},
-                          {0.4988f, 0.4988f, 0.4988f, 0.4988f});
-  ASSERT_NO_THROW(obj->DefaultEvalMetric());
-}
-
-}  // namespace xgboost
diff --git a/tests/cpp/objective/test_ranking_obj_gpu.cu b/tests/cpp/objective/test_ranking_obj_gpu.cu
deleted file mode 100644
index cd40b4928..000000000
--- a/tests/cpp/objective/test_ranking_obj_gpu.cu
+++ /dev/null
@@ -1,175 +0,0 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
- */
-#include <thrust/host_vector.h>
-
-#include "test_ranking_obj.cc"
-#include "../../../src/objective/rank_obj.cu"
-
-namespace xgboost {
-
-template <typename T = uint32_t, typename Comparator = thrust::greater<T>>
-std::unique_ptr<dh::SegmentSorter<T>>
-RankSegmentSorterTestImpl(const std::vector<uint32_t> &group_indices,
-                          const std::vector<T> &hlabels,
-                          const std::vector<T> &expected_sorted_hlabels,
-                          const std::vector<uint32_t> &expected_orig_pos
-                          ) {
-  std::unique_ptr<dh::SegmentSorter<T>> seg_sorter_ptr(new dh::SegmentSorter<T>);
-  dh::SegmentSorter<T> &seg_sorter(*seg_sorter_ptr);
-
-  // Create a bunch of unsorted labels on the device and sort it via the segment sorter
-  dh::device_vector<T> dlabels(hlabels);
-  seg_sorter.SortItems(dlabels.data().get(), dlabels.size(), group_indices, Comparator());
-
-  auto num_items = seg_sorter.GetItemsSpan().size();
-  EXPECT_EQ(num_items, group_indices.back());
-  EXPECT_EQ(seg_sorter.GetNumGroups(), group_indices.size() - 1);
-
-  // Check the labels
-  dh::device_vector<T> sorted_dlabels(num_items);
-  sorted_dlabels.assign(dh::tcbegin(seg_sorter.GetItemsSpan()),
-                        dh::tcend(seg_sorter.GetItemsSpan()));
-  thrust::host_vector<T> sorted_hlabels(sorted_dlabels);
-  EXPECT_EQ(expected_sorted_hlabels, sorted_hlabels);
-
-  // Check the indices
-  dh::device_vector<uint32_t> dorig_pos(num_items);
-  dorig_pos.assign(dh::tcbegin(seg_sorter.GetOriginalPositionsSpan()),
-                   dh::tcend(seg_sorter.GetOriginalPositionsSpan()));
-  dh::device_vector<uint32_t> horig_pos(dorig_pos);
-  EXPECT_EQ(expected_orig_pos, horig_pos);
-
-  return seg_sorter_ptr;
-}
-
-TEST(Objective, RankSegmentSorterTest) {
-  RankSegmentSorterTestImpl({0, 2, 4, 7, 10, 14, 18, 22, 26},  // Groups
-                            {1, 1,                             // Labels
-                             1, 2,
-                             3, 2, 1,
-                             1, 2, 1,
-                             1, 3, 4, 2,
-                             1, 2, 1, 1,
-                             1, 2, 2, 3,
-                             3, 3, 1, 2},
-                            {1, 1,                             // Expected sorted labels
-                             2, 1,
-                             3, 2, 1,
-                             2, 1, 1,
-                             4, 3, 2, 1,
-                             2, 1, 1, 1,
-                             3, 2, 2, 1,
-                             3, 3, 2, 1},
-                            {0, 1,                             // Expected original positions
-                             3, 2,
-                             4, 5, 6,
-                             8, 7, 9,
-                             12, 11, 13, 10,
-                             15, 14, 16, 17,
-                             21, 19, 20, 18,
-                             22, 23, 25, 24});
-}
-
-TEST(Objective, RankSegmentSorterSingleGroupTest) {
-  RankSegmentSorterTestImpl({0, 7},                  // Groups
-                            {6, 1, 4, 3, 0, 5, 2},   // Labels
-                            {6, 5, 4, 3, 2, 1, 0},   // Expected sorted labels
-                            {0, 5, 2, 3, 6, 1, 4});  // Expected original positions
-}
-
-TEST(Objective, RankSegmentSorterAscendingTest) {
-  RankSegmentSorterTestImpl<uint32_t, thrust::less<uint32_t>>(
-                                                    {0, 4, 7},    // Groups
-                                                    {3, 1, 4, 2,  // Labels
-                                                     6, 5, 7},
-                                                    {1, 2, 3, 4,  // Expected sorted labels
-                                                     5, 6, 7},
-                                                    {1, 3, 0, 2,  // Expected original positions
-                                                     5, 4, 6});
-}
-
-TEST(Objective, IndexableSortedItemsTest) {
-  std::vector<float> hlabels = {3.1f, 1.2f, 2.3f, 4.4f,        // Labels
-                                7.8f, 5.01f, 6.96f,
-                                10.3f, 8.7f, 11.4f, 9.45f, 11.4f};
-  dh::device_vector<bst_float> dlabels(hlabels);
-
-  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(
-    {0, 4, 7, 12},                  // Groups
-    hlabels,
-    {4.4f, 3.1f, 2.3f, 1.2f,        // Expected sorted labels
-     7.8f, 6.96f, 5.01f,
-     11.4f, 11.4f, 10.3f, 9.45f, 8.7f},
-    {3, 0, 2, 1,                    // Expected original positions
-     4, 6, 5,
-     9, 11, 7, 10, 8});
-
-  segment_label_sorter->CreateIndexableSortedPositions();
-  std::vector<uint32_t> sorted_indices(segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&sorted_indices,
-                             segment_label_sorter->GetIndexableSortedPositionsSpan());
-  std::vector<uint32_t> expected_sorted_indices = {
-    1, 3, 2, 0,
-    4, 6, 5,
-    9, 11, 7, 10, 8};
-  EXPECT_EQ(expected_sorted_indices, sorted_indices);
-}
-
-TEST(Objective, ComputeAndCompareMAPStatsTest) {
-  std::vector<float> hlabels = {3.1f, 0.0f, 2.3f, 4.4f,        // Labels
-                                0.0f, 5.01f, 0.0f,
-                                10.3f, 0.0f, 11.4f, 9.45f, 11.4f};
-  dh::device_vector<bst_float> dlabels(hlabels);
-
-  auto segment_label_sorter = RankSegmentSorterTestImpl<float>(
-    {0, 4, 7, 12},                  // Groups
-    hlabels,
-    {4.4f, 3.1f, 2.3f, 0.0f,        // Expected sorted labels
-     5.01f, 0.0f, 0.0f,
-     11.4f, 11.4f, 10.3f, 9.45f, 0.0f},
-    {3, 0, 2, 1,                    // Expected original positions
-     5, 4, 6,
-     9, 11, 7, 10, 8});
-
-  // Create MAP stats on the device first using the objective
-  std::vector<bst_float> hpreds{-9.78f, 24.367f, 0.908f, -11.47f,
-                                -1.03f, -2.79f, -3.1f,
-                                104.22f, 103.1f, -101.7f, 100.5f, 45.1f};
-  dh::device_vector<bst_float> dpreds(hpreds);
-
-  xgboost::obj::MAPLambdaWeightComputer map_lw_computer(dpreds.data().get(),
-                                                        dlabels.data().get(),
-                                                        *segment_label_sorter);
-
-  // Get the device MAP stats on host
-  std::vector<xgboost::obj::MAPLambdaWeightComputer::MAPStats> dmap_stats(
-    segment_label_sorter->GetNumItems());
-  dh::CopyDeviceSpanToVector(&dmap_stats, map_lw_computer.GetMapStatsSpan());
-
-  // Compute the MAP stats on host next to compare
-  std::vector<uint32_t> hgroups(segment_label_sorter->GetNumGroups() + 1);
-  dh::CopyDeviceSpanToVector(&hgroups, segment_label_sorter->GetGroupsSpan());
-
-  for (size_t i = 0; i < hgroups.size() - 1; ++i) {
-    auto gbegin = hgroups[i];
-    auto gend = hgroups[i + 1];
-    std::vector<xgboost::obj::ListEntry> lst_entry;
-    for (auto j = gbegin; j < gend; ++j) {
-      lst_entry.emplace_back(hpreds[j], hlabels[j], j);
-    }
-    std::stable_sort(lst_entry.begin(), lst_entry.end(), xgboost::obj::ListEntry::CmpPred);
-
-    // Compute the MAP stats with this list and compare with the ones computed on the device
-    std::vector<xgboost::obj::MAPLambdaWeightComputer::MAPStats> hmap_stats;
-    xgboost::obj::MAPLambdaWeightComputer::GetMAPStats(lst_entry, &hmap_stats);
-    for (auto j = gbegin; j < gend; ++j) {
-      EXPECT_EQ(dmap_stats[j].hits, hmap_stats[j - gbegin].hits);
-      EXPECT_NEAR(dmap_stats[j].ap_acc, hmap_stats[j - gbegin].ap_acc, 0.01f);
-      EXPECT_NEAR(dmap_stats[j].ap_acc_miss, hmap_stats[j - gbegin].ap_acc_miss, 0.01f);
-      EXPECT_NEAR(dmap_stats[j].ap_acc_add, hmap_stats[j - gbegin].ap_acc_add, 0.01f);
-    }
-  }
-}
-
-}  // namespace xgboost
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 67620e6dd..e0d3d680b 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -176,7 +176,7 @@ def test_ranking():
 def test_ranking_metric() -> None:
     from sklearn.metrics import roc_auc_score
 
-    X, y, qid, w = tm.make_ltr(512, 4, 3, 2)
+    X, y, qid, w = tm.make_ltr(512, 4, 3, 1)
     # use auc for test as ndcg_score in sklearn works only on label gain instead of exp
     # gain.
     # note that the auc in sklearn is different from the one in XGBoost. The one in
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index a5e0f028a..6d88323ac 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -1343,61 +1343,94 @@ class XgboostLocalTest(SparkTestCase):
             SparkXGBClassifier(evals_result={})
 
 
-class XgboostRankerLocalTest(SparkTestCase):
-    def setUp(self):
-        self.session.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "8")
-        self.ranker_df_train = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
-                (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
-                (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
-                (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
-            ],
-            ["features", "label", "qid"],
-        )
-        self.ranker_df_test = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.5, 2.0, 3.0), 0, -1.87988),
-                (Vectors.dense(4.5, 5.0, 6.0), 0, 0.29556),
-                (Vectors.dense(9.0, 4.5, 8.0), 0, 2.36570),
-                (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1, -1.87988),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1, -0.30612),
-                (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1, 2.44826),
-            ],
-            ["features", "qid", "expected_prediction"],
-        )
-        self.ranker_df_train_1 = self.session.createDataFrame(
-            [
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 9),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 9),
-                (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 9),
-                (Vectors.dense(1.0, 2.0, 3.0), 0, 8),
-                (Vectors.dense(4.0, 5.0, 6.0), 1, 8),
-                (Vectors.dense(9.0, 4.0, 8.0), 2, 8),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 7),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 7),
-                (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 7),
-                (Vectors.dense(1.0, 2.0, 3.0), 0, 6),
-                (Vectors.dense(4.0, 5.0, 6.0), 1, 6),
-                (Vectors.dense(9.0, 4.0, 8.0), 2, 6),
-            ]
-            * 4,
-            ["features", "label", "qid"],
-        )
+LTRData = namedtuple("LTRData", ("df_train", "df_test", "df_train_1"))
 
-    def test_ranker(self):
-        ranker = SparkXGBRanker(qid_col="qid")
+
+@pytest.fixture
+def ltr_data(spark: SparkSession) -> Generator[LTRData, None, None]:
+    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "8")
+    ranker_df_train = spark.createDataFrame(
+        [
+            (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
+            (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
+            (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
+            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
+            (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
+        ],
+        ["features", "label", "qid"],
+    )
+    X_train = np.array(
+        [
+            [1.0, 2.0, 3.0],
+            [4.0, 5.0, 6.0],
+            [9.0, 4.0, 8.0],
+            [np.NaN, 1.0, 5.5],
+            [np.NaN, 6.0, 7.5],
+            [np.NaN, 8.0, 9.5],
+        ]
+    )
+    qid_train = np.array([0, 0, 0, 1, 1, 1])
+    y_train = np.array([0, 1, 2, 0, 1, 2])
+
+    X_test = np.array(
+        [
+            [1.5, 2.0, 3.0],
+            [4.5, 5.0, 6.0],
+            [9.0, 4.5, 8.0],
+            [np.NaN, 1.0, 6.0],
+            [np.NaN, 6.0, 7.0],
+            [np.NaN, 8.0, 10.5],
+        ]
+    )
+
+    ltr = xgb.XGBRanker(tree_method="approx", objective="rank:pairwise")
+    ltr.fit(X_train, y_train, qid=qid_train)
+    predt = ltr.predict(X_test)
+
+    ranker_df_test = spark.createDataFrame(
+        [
+            (Vectors.dense(1.5, 2.0, 3.0), 0, float(predt[0])),
+            (Vectors.dense(4.5, 5.0, 6.0), 0, float(predt[1])),
+            (Vectors.dense(9.0, 4.5, 8.0), 0, float(predt[2])),
+            (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1, float(predt[3])),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1, float(predt[4])),
+            (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1, float(predt[5])),
+        ],
+        ["features", "qid", "expected_prediction"],
+    )
+    ranker_df_train_1 = spark.createDataFrame(
+        [
+            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 9),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 9),
+            (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 9),
+            (Vectors.dense(1.0, 2.0, 3.0), 0, 8),
+            (Vectors.dense(4.0, 5.0, 6.0), 1, 8),
+            (Vectors.dense(9.0, 4.0, 8.0), 2, 8),
+            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 7),
+            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 7),
+            (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 7),
+            (Vectors.dense(1.0, 2.0, 3.0), 0, 6),
+            (Vectors.dense(4.0, 5.0, 6.0), 1, 6),
+            (Vectors.dense(9.0, 4.0, 8.0), 2, 6),
+        ]
+        * 4,
+        ["features", "label", "qid"],
+    )
+    yield LTRData(ranker_df_train, ranker_df_test, ranker_df_train_1)
+
+
+class TestPySparkLocalLETOR:
+    def test_ranker(self, ltr_data: LTRData) -> None:
+        ranker = SparkXGBRanker(qid_col="qid", objective="rank:pairwise")
         assert ranker.getOrDefault(ranker.objective) == "rank:pairwise"
-        model = ranker.fit(self.ranker_df_train)
-        pred_result = model.transform(self.ranker_df_test).collect()
-
+        model = ranker.fit(ltr_data.df_train)
+        pred_result = model.transform(ltr_data.df_test).collect()
         for row in pred_result:
             assert np.isclose(row.prediction, row.expected_prediction, rtol=1e-3)
 
-    def test_ranker_qid_sorted(self):
-        ranker = SparkXGBRanker(qid_col="qid", num_workers=4)
-        assert ranker.getOrDefault(ranker.objective) == "rank:pairwise"
-        model = ranker.fit(self.ranker_df_train_1)
-        model.transform(self.ranker_df_test).collect()
+    def test_ranker_qid_sorted(self, ltr_data: LTRData) -> None:
+        ranker = SparkXGBRanker(qid_col="qid", num_workers=4, objective="rank:ndcg")
+        assert ranker.getOrDefault(ranker.objective) == "rank:ndcg"
+        model = ranker.fit(ltr_data.df_train_1)
+        model.transform(ltr_data.df_test).collect()

From fb941262b461281eb784e6ff27dfd0919338ac34 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Fri, 28 Apr 2023 01:03:21 -0700
Subject: [PATCH 30/34] Add demo for vertical federated learning (#9103)

---
 demo/nvflare/README.md                        | 61 ++----------
 demo/nvflare/config/config_fed_client.json    | 23 -----
 demo/nvflare/config/config_fed_server.json    | 22 -----
 demo/nvflare/horizontal/README.md             | 63 ++++++++++++
 .../{ => horizontal}/custom/controller.py     |  0
 .../{ => horizontal}/custom/trainer.py        |  0
 demo/nvflare/{ => horizontal}/prepare_data.sh |  4 +-
 demo/nvflare/vertical/README.md               | 59 +++++++++++
 demo/nvflare/vertical/custom/controller.py    | 68 +++++++++++++
 demo/nvflare/vertical/custom/trainer.py       | 97 +++++++++++++++++++
 demo/nvflare/vertical/prepare_data.sh         | 65 +++++++++++++
 11 files changed, 360 insertions(+), 102 deletions(-)
 delete mode 100755 demo/nvflare/config/config_fed_client.json
 delete mode 100755 demo/nvflare/config/config_fed_server.json
 create mode 100644 demo/nvflare/horizontal/README.md
 rename demo/nvflare/{ => horizontal}/custom/controller.py (100%)
 rename demo/nvflare/{ => horizontal}/custom/trainer.py (100%)
 rename demo/nvflare/{ => horizontal}/prepare_data.sh (88%)
 create mode 100644 demo/nvflare/vertical/README.md
 create mode 100644 demo/nvflare/vertical/custom/controller.py
 create mode 100644 demo/nvflare/vertical/custom/trainer.py
 create mode 100755 demo/nvflare/vertical/prepare_data.sh

diff --git a/demo/nvflare/README.md b/demo/nvflare/README.md
index 328dd7212..93f388208 100644
--- a/demo/nvflare/README.md
+++ b/demo/nvflare/README.md
@@ -3,61 +3,12 @@
 This directory contains a demo of Federated Learning using
 [NVFlare](https://nvidia.github.io/NVFlare/).
 
-## Training with CPU only
+## Horizontal Federated XGBoost
 
-To run the demo, first build XGBoost with the federated learning plugin enabled (see the
-[README](../../plugin/federated/README.md)).
+For horizontal federated learning using XGBoost (data is split row-wise), check out the `horizontal` directory
+(see the [README](horizontal/README.md)).
 
-Install NVFlare (note that currently NVFlare only supports Python 3.8):
-```shell
-pip install nvflare
-```
+## Vertical Federated XGBoost
 
-Prepare the data:
-```shell
-./prepare_data.sh
-```
-
-Start the NVFlare federated server:
-```shell
-/tmp/nvflare/poc/server/startup/start.sh
-```
-
-In another terminal, start the first worker:
-```shell
-/tmp/nvflare/poc/site-1/startup/start.sh
-```
-
-And the second worker:
-```shell
-/tmp/nvflare/poc/site-2/startup/start.sh
-```
-
-Then start the admin CLI:
-```shell
-/tmp/nvflare/poc/admin/startup/fl_admin.sh
-```
-
-In the admin CLI, run the following command:
-```shell
-submit_job hello-xgboost
-```
-
-Once the training finishes, the model file should be written into
-`/tmp/nvlfare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
-respectively.
-
-Finally, shutdown everything from the admin CLI, using `admin` as password:
-```shell
-shutdown client
-shutdown server
-```
-
-## Training with GPUs
-
-To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
-Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
-turned off (see the [README](../../plugin/federated/README.md)).
-
-Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
-above.
+For vertical federated learning using XGBoost (data is split column-wise), check out the `vertical` directory
+(see the [README](vertical/README.md)).
diff --git a/demo/nvflare/config/config_fed_client.json b/demo/nvflare/config/config_fed_client.json
deleted file mode 100755
index c15a1997c..000000000
--- a/demo/nvflare/config/config_fed_client.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "format_version": 2,
-  "executors": [
-    {
-      "tasks": [
-        "train"
-      ],
-      "executor": {
-        "path": "trainer.XGBoostTrainer",
-        "args": {
-          "server_address": "localhost:9091",
-          "world_size": 2,
-          "server_cert_path": "server-cert.pem",
-          "client_key_path": "client-key.pem",
-          "client_cert_path": "client-cert.pem",
-          "use_gpus": "false"
-        }
-      }
-    }
-  ],
-  "task_result_filters": [],
-  "task_data_filters": []
-}
diff --git a/demo/nvflare/config/config_fed_server.json b/demo/nvflare/config/config_fed_server.json
deleted file mode 100755
index 32993b652..000000000
--- a/demo/nvflare/config/config_fed_server.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "format_version": 2,
-  "server": {
-    "heart_beat_timeout": 600
-  },
-  "task_data_filters": [],
-  "task_result_filters": [],
-  "workflows": [
-    {
-      "id": "server_workflow",
-      "path": "controller.XGBoostController",
-      "args": {
-        "port": 9091,
-        "world_size": 2,
-        "server_key_path": "server-key.pem",
-        "server_cert_path": "server-cert.pem",
-        "client_cert_path": "client-cert.pem"
-      }
-    }
-  ],
-  "components": []
-}
diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md
new file mode 100644
index 000000000..93ea3794c
--- /dev/null
+++ b/demo/nvflare/horizontal/README.md
@@ -0,0 +1,63 @@
+# Experimental Support of Horizontal Federated XGBoost using NVFlare
+
+This directory contains a demo of Horizontal Federated Learning using
+[NVFlare](https://nvidia.github.io/NVFlare/).
+
+## Training with CPU only
+
+To run the demo, first build XGBoost with the federated learning plugin enabled (see the
+[README](../../../plugin/federated/README.md)).
+
+Install NVFlare (note that currently NVFlare only supports Python 3.8):
+```shell
+pip install nvflare
+```
+
+Prepare the data:
+```shell
+./prepare_data.sh
+```
+
+Start the NVFlare federated server:
+```shell
+/tmp/nvflare/poc/server/startup/start.sh
+```
+
+In another terminal, start the first worker:
+```shell
+/tmp/nvflare/poc/site-1/startup/start.sh
+```
+
+And the second worker:
+```shell
+/tmp/nvflare/poc/site-2/startup/start.sh
+```
+
+Then start the admin CLI:
+```shell
+/tmp/nvflare/poc/admin/startup/fl_admin.sh
+```
+
+In the admin CLI, run the following command:
+```shell
+submit_job horizontal-xgboost
+```
+
+Once the training finishes, the model file should be written into
+`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
+respectively.
+
+Finally, shut down everything from the admin CLI, using `admin` as the password:
+```shell
+shutdown client
+shutdown server
+```
+
+## Training with GPUs
+
+To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs.
+Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL
+turned off (see the [README](../../../plugin/federated/README.md)).
+
+Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps
+above.
diff --git a/demo/nvflare/custom/controller.py b/demo/nvflare/horizontal/custom/controller.py
similarity index 100%
rename from demo/nvflare/custom/controller.py
rename to demo/nvflare/horizontal/custom/controller.py
diff --git a/demo/nvflare/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py
similarity index 100%
rename from demo/nvflare/custom/trainer.py
rename to demo/nvflare/horizontal/custom/trainer.py
diff --git a/demo/nvflare/prepare_data.sh b/demo/nvflare/horizontal/prepare_data.sh
similarity index 88%
rename from demo/nvflare/prepare_data.sh
rename to demo/nvflare/horizontal/prepare_data.sh
index 1c88c65fe..6a32008f8 100755
--- a/demo/nvflare/prepare_data.sh
+++ b/demo/nvflare/horizontal/prepare_data.sh
@@ -15,8 +15,8 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train ag
 split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-
 
 nvflare poc -n 2 --prepare
-mkdir -p /tmp/nvflare/poc/admin/transfer/hello-xgboost
-cp -fr config custom /tmp/nvflare/poc/admin/transfer/hello-xgboost
+mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
+cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
 cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
 for id in $(eval echo "{1..$world_size}"); do
   cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
diff --git a/demo/nvflare/vertical/README.md b/demo/nvflare/vertical/README.md
new file mode 100644
index 000000000..83c3111b6
--- /dev/null
+++ b/demo/nvflare/vertical/README.md
@@ -0,0 +1,59 @@
+# Experimental Support of Vertical Federated XGBoost using NVFlare
+
+This directory contains a demo of Vertical Federated Learning using
+[NVFlare](https://nvidia.github.io/NVFlare/).
+
+## Training with CPU only
+
+To run the demo, first build XGBoost with the federated learning plugin enabled (see the
+[README](../../plugin/federated/README.md)).
+
+Install NVFlare (note that currently NVFlare only supports Python 3.8):
+```shell
+pip install nvflare
+```
+
+Prepare the data (note that this step will download the HIGGS dataset, which is 2.6GB compressed, and 7.5GB
+uncompressed, so make sure you have enough disk space and are on a fast internet connection):
+```shell
+./prepare_data.sh
+```
+
+Start the NVFlare federated server:
+```shell
+/tmp/nvflare/poc/server/startup/start.sh
+```
+
+In another terminal, start the first worker:
+```shell
+/tmp/nvflare/poc/site-1/startup/start.sh
+```
+
+And the second worker:
+```shell
+/tmp/nvflare/poc/site-2/startup/start.sh
+```
+
+Then start the admin CLI:
+```shell
+/tmp/nvflare/poc/admin/startup/fl_admin.sh
+```
+
+In the admin CLI, run the following command:
+```shell
+submit_job vertical-xgboost
+```
+
+Once the training finishes, the model file `higgs.model.federated.vertical.json` should be written
+into `/tmp/nvflare/poc/site-1/run_1/` and `/tmp/nvflare/poc/site-2/run_1/`
+respectively.
+
+Finally, shutdown everything from the admin CLI, using `admin` as password:
+```shell
+shutdown client
+shutdown server
+```
+
+## Training with GPUs
+
+Currently GPUs are not yet supported by vertical federated XGBoost.
diff --git a/demo/nvflare/vertical/custom/controller.py b/demo/nvflare/vertical/custom/controller.py
new file mode 100644
index 000000000..dd3e39f46
--- /dev/null
+++ b/demo/nvflare/vertical/custom/controller.py
@@ -0,0 +1,68 @@
+"""
+Example of training controller with NVFlare
+===========================================
+"""
+import multiprocessing
+
+from nvflare.apis.client import Client
+from nvflare.apis.fl_context import FLContext
+from nvflare.apis.impl.controller import Controller, Task
+from nvflare.apis.shareable import Shareable
+from nvflare.apis.signal import Signal
+from trainer import SupportedTasks
+
+import xgboost.federated
+
+
+class XGBoostController(Controller):
+    def __init__(self, port: int, world_size: int, server_key_path: str,
+                 server_cert_path: str, client_cert_path: str):
+        """Controller hosting the federated XGBoost server and broadcasting the train task.
+
+        Args:
+            port: the port for the gRPC server to listen on.
+            world_size: the number of sites.
+            server_key_path: the path to the server key file.
+            server_cert_path: the path to the server certificate file.
+            client_cert_path: the path to the client certificate file.
+        """
+        super().__init__()
+        self._port = port
+        self._world_size = world_size
+        self._server_key_path = server_key_path
+        self._server_cert_path = server_cert_path
+        self._client_cert_path = client_cert_path
+        self._server = None  # server process handle; created in start_controller()
+
+    def start_controller(self, fl_ctx: FLContext):  # runs the federated server in a child process
+        self._server = multiprocessing.Process(
+            target=xgboost.federated.run_federated_server,
+            args=(self._port, self._world_size, self._server_key_path,
+                  self._server_cert_path, self._client_cert_path))
+        self._server.start()
+
+    def stop_controller(self, fl_ctx: FLContext):
+        if self._server:  # still None when start_controller() was never called
+            self._server.terminate()
+
+    def process_result_of_unknown_task(self, client: Client, task_name: str,
+                                       client_task_id: str, result: Shareable,
+                                       fl_ctx: FLContext):
+        self.log_warning(fl_ctx, f"Unknown task: {task_name} from client {client.name}.")
+
+    def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
+        self.log_info(fl_ctx, "XGBoost training control flow started.")
+        if abort_signal.triggered:
+            return
+        task = Task(name=SupportedTasks.TRAIN, data=Shareable())  # empty payload, name only
+        self.broadcast_and_wait(
+            task=task,
+            min_responses=self._world_size,  # one response expected per site
+            fl_ctx=fl_ctx,
+            wait_time_after_min_received=1,
+            abort_signal=abort_signal,
+        )
+        if abort_signal.triggered:
+            return
+
+        self.log_info(fl_ctx, "XGBoost training control flow finished.")
diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py
new file mode 100644
index 000000000..cd420129c
--- /dev/null
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -0,0 +1,97 @@
+import os
+
+from nvflare.apis.executor import Executor
+from nvflare.apis.fl_constant import FLContextKey, ReturnCode
+from nvflare.apis.fl_context import FLContext
+from nvflare.apis.shareable import Shareable, make_reply
+from nvflare.apis.signal import Signal
+
+import xgboost as xgb
+from xgboost import callback
+
+
+class SupportedTasks(object):
+    TRAIN = "train"  # the only task name XGBoostTrainer.execute() accepts
+
+
+class XGBoostTrainer(Executor):
+    def __init__(self, server_address: str, world_size: int, server_cert_path: str,
+                 client_key_path: str, client_cert_path: str):
+        """Trainer for federated XGBoost.
+
+        Args:
+            server_address: address for the gRPC server to connect to.
+            world_size: the number of sites.
+            server_cert_path: the path to the server certificate file.
+            client_key_path: the path to the client key file.
+            client_cert_path: the path to the client certificate file.
+        """
+        super().__init__()
+        self._server_address = server_address
+        self._world_size = world_size
+        self._server_cert_path = server_cert_path
+        self._client_key_path = client_key_path
+        self._client_cert_path = client_cert_path
+
+    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
+                abort_signal: Signal) -> Shareable:
+        self.log_info(fl_ctx, f"Executing {task_name}")
+        try:
+            if task_name == SupportedTasks.TRAIN:
+                self._do_training(fl_ctx)
+                return make_reply(ReturnCode.OK)
+            else:
+                self.log_error(fl_ctx, f"{task_name} is not a supported task.")
+                return make_reply(ReturnCode.TASK_UNKNOWN)
+        except BaseException as e:  # NOTE(review): also traps KeyboardInterrupt/SystemExit - confirm
+            self.log_exception(fl_ctx,
+                               f"Task {task_name} failed. Exception: {e.__str__()}")
+            return make_reply(ReturnCode.EXECUTION_EXCEPTION)
+
+    def _do_training(self, fl_ctx: FLContext):
+        client_name = fl_ctx.get_prop(FLContextKey.CLIENT_NAME)
+        rank = int(client_name.split('-')[1]) - 1  # e.g. "site-1" -> rank 0
+        communicator_env = {
+            'xgboost_communicator': 'federated',
+            'federated_server_address': self._server_address,
+            'federated_world_size': self._world_size,
+            'federated_rank': rank,
+            'federated_server_cert': self._server_cert_path,
+            'federated_client_key': self._client_key_path,
+            'federated_client_cert': self._client_cert_path
+        }
+        with xgb.collective.CommunicatorContext(**communicator_env):
+            # Each site loads its own column slice; the file is not sharded in federated mode.
+            if rank == 0:  # only the first site's file carries the label, as column 0
+                label = '&label_column=0'
+            else:
+                label = ''
+            dtrain = xgb.DMatrix(f'higgs.train.csv?format=csv{label}', data_split_mode=1)
+            dtest = xgb.DMatrix(f'higgs.test.csv?format=csv{label}', data_split_mode=1)
+
+            # specify parameters via map
+            param = {
+                'validate_parameters': True,
+                'eta': 0.1,
+                'gamma': 1.0,
+                'max_depth': 8,
+                'min_child_weight': 100,
+                'tree_method': 'approx',
+                'grow_policy': 'depthwise',
+                'objective': 'binary:logistic',
+                'eval_metric': 'auc',
+            }
+
+            # specify validations set to watch performance
+            watchlist = [(dtest, "eval"), (dtrain, "train")]
+            # number of boosting rounds
+            num_round = 10
+
+            bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2)
+
+            # Save the model into this site's run directory.
+            workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
+            run_number = fl_ctx.get_prop(FLContextKey.CURRENT_RUN)
+            run_dir = workspace.get_run_dir(run_number)
+            bst.save_model(os.path.join(run_dir, "higgs.model.federated.vertical.json"))
+            xgb.collective.communicator_print("Finished training\n")
diff --git a/demo/nvflare/vertical/prepare_data.sh b/demo/nvflare/vertical/prepare_data.sh
new file mode 100755
index 000000000..86ec3dfa2
--- /dev/null
+++ b/demo/nvflare/vertical/prepare_data.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -e
+
+rm -fr ./*.pem /tmp/nvflare/poc
+
+world_size=2
+
+# Generate server and client certificates.
+openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out server-cert.pem -subj "/C=US/CN=localhost"
+openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
+
+# Download HIGGS dataset.
+if [ -f "HIGGS.csv" ]; then
+  echo "HIGGS.csv exists, skipping download."
+else
+  echo "Downloading HIGGS dataset."
+  wget https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
+  gunzip HIGGS.csv.gz
+fi
+
+# Split into train/test.
+if [[ -f higgs.train.csv && -f higgs.test.csv ]]; then
+  echo "higgs.train.csv and higgs.test.csv exist, skipping split."
+else
+  echo "Splitting HIGGS dataset into train/test."
+  head -n 10450000 HIGGS.csv > higgs.train.csv
+  tail -n 550000 HIGGS.csv > higgs.test.csv
+fi
+
+# Split train and test files by column to simulate a federated environment.
+site_files=(higgs.{train,test}.csv-site-*)
+if [ ${#site_files[@]} -eq $((world_size*2)) ]; then
+  echo "Site files exist, skipping split."
+else
+  echo "Splitting train/test into site files."
+  total_cols=28  # plus label
+  cols=$((total_cols/world_size))
+  echo "Columns per site: $cols"
+  for (( site=1; site<=world_size; site++ )); do
+    if (( site == 1 )); then
+      start=$((cols*(site-1)+1))
+    else
+      start=$((cols*(site-1)+2))
+    fi
+    if (( site == world_size )); then
+      end=$((total_cols+1))
+    else
+      end=$((cols*site+1))
+    fi
+    echo "Site $site, columns $start-$end"
+    cut -d, -f${start}-${end} higgs.train.csv > higgs.train.csv-site-"${site}"
+    cut -d, -f${start}-${end} higgs.test.csv > higgs.test.csv-site-"${site}"
+  done
+fi
+
+nvflare poc -n "${world_size}" --prepare  # one POC client per site, kept in sync with world_size
+mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost
+cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
+for (( site=1; site<=world_size; site++ )); do
+  cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/
+  ln -s "${PWD}"/higgs.train.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.train.csv
+  ln -s "${PWD}"/higgs.test.csv-site-"${site}" /tmp/nvflare/poc/site-"${site}"/higgs.test.csv
+done

From 17ff471616ae2d4598d1143435236c2e0c191861 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 28 Apr 2023 18:01:58 +0800
Subject: [PATCH 31/34] Optimize array interface input. (#9090)

---
 src/common/error_msg.h     |  4 ++
 src/data/array_interface.h | 84 +++++++++++++++++++++++++++++++++++---
 src/data/data.cc           | 11 +++--
 3 files changed, 89 insertions(+), 10 deletions(-)

diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 3dbb7f52c..4415bf2ee 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -24,5 +24,9 @@ constexpr StringView LabelScoreSize() {
 constexpr StringView InfInData() {
   return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
 }
+
+constexpr StringView NoF128() {  // message for checks that require a 16-byte long double
+  return "128-bit floating point is not supported on current platform.";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index e9045899b..fee22203c 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -7,8 +7,9 @@
 #define XGBOOST_DATA_ARRAY_INTERFACE_H_
 
 #include <algorithm>
-#include <cstddef>  // std::size_t
+#include <cstddef>  // for size_t
 #include <cstdint>
+#include <limits>  // for numeric_limits
 #include <map>
 #include <string>
 #include <type_traits>  // std::alignment_of,std::remove_pointer_t
@@ -17,6 +18,7 @@
 
 #include "../common/bitfield.h"
 #include "../common/common.h"
+#include "../common/error_msg.h"  // for NoF128
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/json.h"
@@ -454,9 +456,8 @@ class ArrayInterface {
   void AssignType(StringView typestr) {
     using T = ArrayInterfaceHandler::Type;
     if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' && typestr[3] == '6') {
+      CHECK(sizeof(long double) == 16) << error::NoF128();
       type = T::kF16;
-      CHECK(sizeof(long double) == 16)
-          << "128-bit floating point is not supported on current platform.";
     } else if (typestr[1] == 'f' && typestr[2] == '2') {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
       type = T::kF2;
@@ -572,19 +573,90 @@ class ArrayInterface {
   // Used only by columnar format.
   RBitField8 valid;
   // Array stride
-  size_t strides[D]{0};
+  std::size_t strides[D]{0};
   // Array shape
-  size_t shape[D]{0};
+  std::size_t shape[D]{0};
   // Type earsed pointer referencing the data.
   void const *data{nullptr};
   // Total number of items
-  size_t n{0};
+  std::size_t n{0};
   // Whether the memory is c-contiguous
   bool is_contiguous{false};
   // RTTI, initialized to the f16 to avoid masking potential bugs in initialization.
   ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
 };
 
+template <std::int32_t D, typename Fn>  // calls fn with a typed, read-only TensorView of array
+void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
+  // The validity mask is only used for cuDF at the moment, so it must be empty here.
+  CHECK_EQ(array.valid.Size(), 0);
+  auto dispatch = [&](auto t) {  // t is a dummy value that only carries the element type
+    using T = std::remove_const_t<decltype(t)> const;
+    // The span length is set to the maximum because the original size of a sliced array
+    // is unknown. For example:
+    // Slicing an array A with shape (4, 2, 3) and stride (6, 3, 1) by [:, 1, :] results
+    // in an array B with shape (4, 3) and strides (6, 1). We can't calculate the original
+    // size 24 based on the slice.
+    fn(linalg::TensorView<T, D>{common::Span<T const>{static_cast<T *>(array.data),
+                                                      std::numeric_limits<std::size_t>::max()},
+                                array.shape, array.strides, device});
+  };
+  switch (array.type) {
+    case ArrayInterfaceHandler::kF2: {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+      dispatch(__half{});
+#endif
+      break;  // NOTE(review): fn is never called when __half is compiled out - confirm unreachable
+    }
+    case ArrayInterfaceHandler::kF4: {
+      dispatch(float{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF8: {
+      dispatch(double{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF16: {
+      using T = long double;
+      CHECK(sizeof(long double) == 16) << error::NoF128();
+      dispatch(T{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI1: {
+      dispatch(std::int8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI2: {
+      dispatch(std::int16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI4: {
+      dispatch(std::int32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI8: {
+      dispatch(std::int64_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU1: {
+      dispatch(std::uint8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU2: {
+      dispatch(std::uint16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU4: {
+      dispatch(std::uint32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU8: {
+      dispatch(std::uint64_t{});
+      break;
+    }
+  }
+}
+
 /**
  * \brief Helper for type casting.
  */
diff --git a/src/data/data.cc b/src/data/data.cc
index 9f85e7db2..236bd9131 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -427,10 +427,13 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
     return;
   }
   p_out->Reshape(array.shape);
-  auto t = p_out->View(Context::kCpuId);
-  CHECK(t.CContiguous());
-  linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
-    return linalg::detail::Apply(TypedIndex<T, D>{array}, linalg::UnravelIndex<D>(i, t.Shape()));
+  auto t_out = p_out->View(Context::kCpuId);
+  CHECK(t_out.CContiguous());
+  auto const shape = t_out.Shape();
+  DispatchDType(array, Context::kCpuId, [&](auto&& in) {
+    linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
+      return std::apply(in, linalg::UnravelIndex<D>(i, shape));
+    });
   });
 }
 }  // namespace

From e9220043293cee1f890332ac1a49112054b6737c Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Fri, 28 Apr 2023 19:43:58 +0800
Subject: [PATCH 32/34] [doc] fix the cudf installation [skip ci] (#9106)

---
 doc/tutorials/spark_estimator.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst
index 02ddb60ea..fb69b70e1 100644
--- a/doc/tutorials/spark_estimator.rst
+++ b/doc/tutorials/spark_estimator.rst
@@ -108,8 +108,8 @@ virtualenv and pip:
   python -m venv xgboost_env
   source xgboost_env/bin/activate
   pip install pyarrow pandas venv-pack xgboost
-  # https://rapids.ai/pip.html#install
-  pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+  # https://docs.rapids.ai/install#pip-install
+  pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com
   venv-pack -o xgboost_env.tar.gz
 
 With Conda:
@@ -241,7 +241,7 @@ additional spark configurations and dependencies:
     --master spark://<master-ip>:7077 \
     --conf spark.executor.resource.gpu.amount=1 \
     --conf spark.task.resource.gpu.amount=1 \
-    --packages com.nvidia:rapids-4-spark_2.12:22.08.0 \
+    --packages com.nvidia:rapids-4-spark_2.12:23.04.0 \
     --conf spark.plugins=com.nvidia.spark.SQLPlugin \
     --conf spark.sql.execution.arrow.maxRecordsPerBatch=1000000 \
     --archives xgboost_env.tar.gz#environment \

From 1f9a57d17b4d351de7ae14aa23b13e4d490ed7d0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 28 Apr 2023 19:45:15 +0800
Subject: [PATCH 33/34] [Breaking] Require format to be specified in input URI.
 (#9077)

Previously, we used `libsvm` as the default when the format was not specified. However, the
dmlc data parser is not particularly robust against errors, and the most common type of
error is an undefined format.

Along with this change, we recommend that users use other data loaders instead. We will
continue to maintain the parsers, as they are currently used for many internal tests,
including federated learning.
---
 R-package/tests/testthat/test_dmatrix.R       |  2 +-
 demo/CLI/binary_classification/mushroom.conf  |  6 +-
 demo/CLI/regression/machine.conf              |  6 +-
 demo/c-api/basic/c-api-demo.c                 |  4 +-
 demo/guide-python/boost_from_prediction.py    | 16 +++--
 demo/guide-python/cross_validation.py         | 62 +++++++++++------
 demo/guide-python/evals_result.py             | 35 ++++++----
 demo/guide-python/generalized_linear_model.py | 26 +++++--
 demo/guide-python/predict_first_ntree.py      |  4 +-
 demo/guide-python/predict_leaf_indices.py     | 14 ++--
 doc/tutorials/external_memory.rst             |  6 +-
 doc/tutorials/input_format.rst                |  7 +-
 include/xgboost/c_api.h                       |  6 +-
 include/xgboost/data.h                        | 12 ++--
 .../java/example/BasicWalkThrough.java        |  9 +--
 .../java/example/BoostFromPrediction.java     |  4 +-
 .../java/example/CrossValidation.java         |  2 +-
 .../java/example/CustomObjective.java         |  4 +-
 .../xgboost4j/java/example/EarlyStopping.java |  4 +-
 .../java/example/ExternalMemory.java          |  4 +-
 .../java/example/GeneralizedLinearModel.java  |  4 +-
 .../java/example/PredictFirstNtree.java       |  4 +-
 .../java/example/PredictLeafIndices.java      |  4 +-
 .../scala/example/BasicWalkThrough.scala      |  8 +--
 .../scala/example/BoostFromPrediction.scala   |  4 +-
 .../scala/example/CrossValidation.scala       |  2 +-
 .../scala/example/CustomObjective.scala       |  4 +-
 .../scala/example/ExternalMemory.scala        |  4 +-
 .../example/GeneralizedLinearModel.scala      |  4 +-
 .../scala/example/PredictFirstNTree.scala     |  4 +-
 .../scala/example/PredictLeafIndices.scala    |  4 +-
 .../dmlc/xgboost4j/java/BoosterImplTest.java  |  4 +-
 .../ml/dmlc/xgboost4j/java/DMatrixTest.java   |  4 +-
 .../dmlc/xgboost4j/scala/DMatrixSuite.scala   |  2 +-
 .../scala/ScalaBoosterImplSuite.scala         | 40 +++++------
 python-package/xgboost/testing/__init__.py    |  7 ++
 src/data/data.cc                              | 68 ++++++-------------
 src/data/file_iterator.h                      | 51 ++++++++++----
 tests/cpp/common/test_hist_util.h             |  3 +-
 tests/cpp/data/test_file_iterator.cc          |  8 +--
 tests/cpp/data/test_metainfo.cc               |  5 +-
 tests/cpp/data/test_simple_dmatrix.cc         | 12 ++--
 tests/cpp/data/test_sparse_page_dmatrix.cc    | 17 +++--
 tests/cpp/data/test_sparse_page_dmatrix.cu    |  2 +-
 tests/cpp/helpers.cc                          |  2 +-
 tests/cpp/test_learner.cc                     |  3 +-
 tests/python/test_basic.py                    | 18 ++---
 tests/python/test_basic_models.py             | 24 +++----
 tests/python/test_callback.py                 |  8 +--
 tests/python/test_dmatrix.py                  |  6 +-
 tests/python/test_interaction_constraints.py  |  8 ++-
 tests/python/test_monotone_constraints.py     |  4 +-
 tests/python/test_openmp.py                   |  4 +-
 tests/python/test_parse_tree.py               |  2 +-
 tests/python/test_plotting.py                 |  4 +-
 tests/python/test_shap.py                     |  4 +-
 tests/python/test_updaters.py                 |  4 +-
 tests/python/test_with_pandas.py              |  2 +-
 58 files changed, 327 insertions(+), 268 deletions(-)

diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 1d8cb0f23..21d39f255 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -72,7 +72,7 @@ test_that("xgb.DMatrix: saving, loading", {
   tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1")
   tmp_file <- tempfile(fileext = ".libsvm")
   writeLines(tmp, tmp_file)
-  dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
+  dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE)
   expect_equal(dim(dtest4), c(3, 4))
   expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0))
 
diff --git a/demo/CLI/binary_classification/mushroom.conf b/demo/CLI/binary_classification/mushroom.conf
index 3cf865465..d78199cd7 100644
--- a/demo/CLI/binary_classification/mushroom.conf
+++ b/demo/CLI/binary_classification/mushroom.conf
@@ -20,10 +20,10 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 2
 # The path of training data
-data = "agaricus.txt.train"
+data = "agaricus.txt.train?format=libsvm"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "agaricus.txt.test"
+eval[test] = "agaricus.txt.test?format=libsvm"
 # evaluate on training data as well each round
 eval_train = 1
 # The path of test data
-test:data = "agaricus.txt.test"
+test:data = "agaricus.txt.test?format=libsvm"
diff --git a/demo/CLI/regression/machine.conf b/demo/CLI/regression/machine.conf
index 4ba8437d5..42e2b1227 100644
--- a/demo/CLI/regression/machine.conf
+++ b/demo/CLI/regression/machine.conf
@@ -21,8 +21,8 @@ num_round = 2
 # 0 means do not save any model except the final round model
 save_period = 0
 # The path of training data
-data = "machine.txt.train"
+data = "machine.txt.train?format=libsvm"
 # The path of validation data, used to monitor training process, here [test] sets name of the validation set
-eval[test] = "machine.txt.test"
+eval[test] = "machine.txt.test?format=libsvm"
 # The path of test data
-test:data = "machine.txt.test"
+test:data = "machine.txt.test?format=libsvm"
diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c
index ca6e689aa..15a224e9e 100644
--- a/demo/c-api/basic/c-api-demo.c
+++ b/demo/c-api/basic/c-api-demo.c
@@ -42,8 +42,8 @@ int main() {
 
   // load the data
   DMatrixHandle dtrain, dtest;
-  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train", silent, &dtrain));
-  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test", silent, &dtest));
+  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.train?format=libsvm", silent, &dtrain));
+  safe_xgboost(XGDMatrixCreateFromFile("../../data/agaricus.txt.test?format=libsvm", silent, &dtest));
 
   // create the booster
   BoosterHandle booster;
diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py
index 53a45549a..13f91d7c8 100644
--- a/demo/guide-python/boost_from_prediction.py
+++ b/demo/guide-python/boost_from_prediction.py
@@ -7,15 +7,19 @@ import os
 import xgboost as xgb
 
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 ###
 # advanced: start from a initial base prediction
 #
-print('start running example to start from a initial prediction')
+print("start running example to start from a initial prediction")
 # specify parameters via map, definition are same as c++ version
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
 # train xgboost for 1 round
 bst = xgb.train(param, dtrain, 1, watchlist)
 # Note: we need the margin value instead of transformed prediction in
@@ -27,5 +31,5 @@ ptest = bst.predict(dtest, output_margin=True)
 dtrain.set_base_margin(ptrain)
 dtest.set_base_margin(ptest)
 
-print('this is result of running from initial prediction')
+print("this is result of running from initial prediction")
 bst = xgb.train(param, dtrain, 1, watchlist)
diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py
index 2565b02c9..4e537108a 100644
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@@ -10,27 +10,45 @@ import xgboost as xgb
 
 # load data in do training
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic'}
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
 num_round = 2
 
-print('running cross validation')
+print("running cross validation")
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
-xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'error'}, seed=0,
-       callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)])
+xgb.cv(
+    param,
+    dtrain,
+    num_round,
+    nfold=5,
+    metrics={"error"},
+    seed=0,
+    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
+)
 
-print('running cross validation, disable standard deviation display')
+print("running cross validation, disable standard deviation display")
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value
-res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
-             metrics={'error'}, seed=0,
-             callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False),
-                        xgb.callback.EarlyStopping(3)])
+res = xgb.cv(
+    param,
+    dtrain,
+    num_boost_round=10,
+    nfold=5,
+    metrics={"error"},
+    seed=0,
+    callbacks=[
+        xgb.callback.EvaluationMonitor(show_stdv=False),
+        xgb.callback.EarlyStopping(3),
+    ],
+)
 print(res)
-print('running cross validation, with preprocessing function')
+print("running cross validation, with preprocessing function")
+
+
 # define the preprocessing function
 # used to return the preprocessed training, test data, and parameter
 # we can use this to do weight rescale, etc.
@@ -38,32 +56,36 @@ print('running cross validation, with preprocessing function')
 def fpreproc(dtrain, dtest, param):
     label = dtrain.get_label()
     ratio = float(np.sum(label == 0)) / np.sum(label == 1)
-    param['scale_pos_weight'] = ratio
+    param["scale_pos_weight"] = ratio
     return (dtrain, dtest, param)
 
+
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
 # then the return value of fpreproc will be used to generate
 # results of that fold
-xgb.cv(param, dtrain, num_round, nfold=5,
-       metrics={'auc'}, seed=0, fpreproc=fpreproc)
+xgb.cv(param, dtrain, num_round, nfold=5, metrics={"auc"}, seed=0, fpreproc=fpreproc)
 
 ###
 # you can also do cross validation with customized loss function
 # See custom_objective.py
 ##
-print('running cross validation, with customized loss function')
+print("running cross validation, with customized loss function")
+
+
 def logregobj(preds, dtrain):
     labels = dtrain.get_label()
     preds = 1.0 / (1.0 + np.exp(-preds))
     grad = preds - labels
     hess = preds * (1.0 - preds)
     return grad, hess
+
+
 def evalerror(preds, dtrain):
     labels = dtrain.get_label()
-    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+    return "error", float(sum(labels != (preds > 0.0))) / len(labels)
 
-param = {'max_depth':2, 'eta':1}
+
+param = {"max_depth": 2, "eta": 1}
 # train with customized objective
-xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
-       obj=logregobj, feval=evalerror)
+xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
diff --git a/demo/guide-python/evals_result.py b/demo/guide-python/evals_result.py
index bba8862f5..7b9da96da 100644
--- a/demo/guide-python/evals_result.py
+++ b/demo/guide-python/evals_result.py
@@ -7,28 +7,37 @@ import os
 import xgboost as xgb
 
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
 
-param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')]
+param = [
+    ("max_depth", 2),
+    ("objective", "binary:logistic"),
+    ("eval_metric", "logloss"),
+    ("eval_metric", "error"),
+]
 
 num_round = 2
-watchlist = [(dtest,'eval'), (dtrain,'train')]
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 
 evals_result = {}
 bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result)
 
-print('Access logloss metric directly from evals_result:')
-print(evals_result['eval']['logloss'])
+print("Access logloss metric directly from evals_result:")
+print(evals_result["eval"]["logloss"])
 
-print('')
-print('Access metrics through a loop:')
+print("")
+print("Access metrics through a loop:")
 for e_name, e_mtrs in evals_result.items():
-    print('- {}'.format(e_name))
+    print("- {}".format(e_name))
     for e_mtr_name, e_mtr_vals in e_mtrs.items():
-        print('   - {}'.format(e_mtr_name))
-        print('      - {}'.format(e_mtr_vals))
+        print("   - {}".format(e_mtr_name))
+        print("      - {}".format(e_mtr_vals))
 
-print('')
-print('Access complete dictionary:')
+print("")
+print("Access complete dictionary:")
 print(evals_result)
diff --git a/demo/guide-python/generalized_linear_model.py b/demo/guide-python/generalized_linear_model.py
index 976428f13..3387b1982 100644
--- a/demo/guide-python/generalized_linear_model.py
+++ b/demo/guide-python/generalized_linear_model.py
@@ -11,14 +11,22 @@ import xgboost as xgb
 #  basically, we are using linear model, instead of tree for our boosters
 ##
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
 # change booster to gblinear, so that we are fitting a linear model
 # alpha is the L1 regularizer
 # lambda is the L2 regularizer
 # you can also set lambda_bias which is L2 regularizer on the bias term
-param = {'objective':'binary:logistic', 'booster':'gblinear',
-         'alpha': 0.0001, 'lambda': 1}
+param = {
+    "objective": "binary:logistic",
+    "booster": "gblinear",
+    "alpha": 0.0001,
+    "lambda": 1,
+}
 
 # normally, you do not need to set eta (step_size)
 # XGBoost uses a parallel coordinate descent algorithm (shotgun),
@@ -29,9 +37,15 @@ param = {'objective':'binary:logistic', 'booster':'gblinear',
 ##
 # the rest of settings are the same
 ##
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 num_round = 4
 bst = xgb.train(param, dtrain, num_round, watchlist)
 preds = bst.predict(dtest)
 labels = dtest.get_label()
-print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))
+print(
+    "error=%f"
+    % (
+        sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i])
+        / float(len(preds))
+    )
+)
diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py
index 55f7c61af..78137b4e1 100644
--- a/demo/guide-python/predict_first_ntree.py
+++ b/demo/guide-python/predict_first_ntree.py
@@ -16,8 +16,8 @@ test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test")
 
 def native_interface():
     # load data in do training
-    dtrain = xgb.DMatrix(train)
-    dtest = xgb.DMatrix(test)
+    dtrain = xgb.DMatrix(train + "?format=libsvm")
+    dtest = xgb.DMatrix(test + "?format=libsvm")
     param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
     watchlist = [(dtest, "eval"), (dtrain, "train")]
     num_round = 3
diff --git a/demo/guide-python/predict_leaf_indices.py b/demo/guide-python/predict_leaf_indices.py
index 45cc8fa7f..627619724 100644
--- a/demo/guide-python/predict_leaf_indices.py
+++ b/demo/guide-python/predict_leaf_indices.py
@@ -8,14 +8,18 @@ import xgboost as xgb
 
 # load data in do training
 CURRENT_DIR = os.path.dirname(__file__)
-dtrain = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(CURRENT_DIR, '../data/agaricus.txt.test'))
-param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
-watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+dtrain = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.train?format=libsvm")
+)
+dtest = xgb.DMatrix(
+    os.path.join(CURRENT_DIR, "../data/agaricus.txt.test?format=libsvm")
+)
+param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
+watchlist = [(dtest, "eval"), (dtrain, "train")]
 num_round = 3
 bst = xgb.train(param, dtrain, num_round, watchlist)
 
-print('start testing predict the leaf indices')
+print("start testing predict the leaf indices")
 # predict using first 2 tree
 leafindex = bst.predict(
     dtest, iteration_range=(0, 2), pred_leaf=True, strict_shape=True
diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst
index 3b96cfe92..006d63b43 100644
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -77,7 +77,7 @@ The external memory version takes in the following `URI <https://en.wikipedia.or
 
 .. code-block:: none
 
-  filename#cacheprefix
+  filename?format=libsvm#cacheprefix
 
 The ``filename`` is the normal path to LIBSVM format file you want to load in, and
 ``cacheprefix`` is a path to a cache file that XGBoost will use for caching preprocessed
@@ -97,13 +97,13 @@ you have a dataset stored in a file similar to ``agaricus.txt.train`` with LIBSV
 
 .. code-block:: python
 
-  dtrain = DMatrix('../data/agaricus.txt.train#dtrain.cache')
+  dtrain = DMatrix('../data/agaricus.txt.train?format=libsvm#dtrain.cache')
 
 XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to a new file named
 ``dtrain.cache`` as an on disk cache for storing preprocessed data in an internal binary format.  For
 more notes about text input formats, see :doc:`/tutorials/input_format`.
 
-For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train#dtrain.cache"``.
+For the CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
 
 
 **********************************
diff --git a/doc/tutorials/input_format.rst b/doc/tutorials/input_format.rst
index 923a82650..ab0158e13 100644
--- a/doc/tutorials/input_format.rst
+++ b/doc/tutorials/input_format.rst
@@ -2,10 +2,15 @@
 Text Input Format of DMatrix
 ############################
 
+.. _basic_input_format:
+
+Here we will briefly describe the text input formats for XGBoost. However, for users with access to a supported language environment like Python or R, it's recommended to use data parsers from that ecosystem instead, for instance :py:func:`sklearn.datasets.load_svmlight_file`.
+
 ******************
 Basic Input Format
 ******************
-XGBoost currently supports two text formats for ingesting data: LIBSVM and CSV. The rest of this document will describe the LIBSVM format. (See `this Wikipedia article <https://en.wikipedia.org/wiki/Comma-separated_values>`_ for a description of the CSV format.).  Please be careful that, XGBoost does **not** understand file extensions, nor try to guess the file format, as there is no universal agreement upon file extension of LIBSVM or CSV.  Instead it employs `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format for specifying the precise input file type.  For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error.  Instead, users need to provide an URI in the form of ``train.csv?format=csv``.  For external memory input, the URI should of a form similar to ``train.csv?format=csv#dtrain.cache``.  See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also.
+
+XGBoost currently supports two text formats for ingesting data: LIBSVM and CSV. The rest of this document will describe the LIBSVM format. (See `this Wikipedia article <https://en.wikipedia.org/wiki/Comma-separated_values>`_ for a description of the CSV format.)  Please note that XGBoost does **not** understand file extensions, nor does it try to guess the file format, as there is no universal agreement upon the file extension of LIBSVM or CSV.  Instead it employs `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format for specifying the precise input file type.  For example if you provide a `csv` file ``./data.train.csv`` as input, XGBoost will blindly use the default LIBSVM parser to digest it and generate a parser error.  Instead, users need to provide a URI in the form of ``train.csv?format=csv`` or ``train.csv?format=libsvm``.  For external memory input, the URI should be of a form similar to ``train.csv?format=csv#dtrain.cache``.  See :ref:`python_data_interface` and :doc:`/tutorials/external_memory` also.
 
 For training or predicting, XGBoost takes an instance file with the format as below:
 
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index e56680780..4b9d37335 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
 /*!
  * \brief load a data matrix
  * \param config JSON encoded parameters for DMatrix construction.  Accepted fields are:
- *   - uri: The URI of the input file.
+
+ *   - uri: The URI of the input file. The URI parameter `format` is required when loading text data.
+ *          \verbatim embed:rst:leading-asterisk
+ *            See :doc:`/tutorials/input_format` for more info.
+ *          \endverbatim
  *   - silent (optional): Whether to print message during loading. Default to true.
  *   - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
  *     file is split accordingly; otherwise this is only an indicator on how the file was split
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index fe22fb2b5..3f7b6ad85 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -566,21 +566,17 @@ class DMatrix {
     return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
   }
 
-  /*!
+  /**
    * \brief Load DMatrix from URI.
+   *
    * \param uri The URI of input.
    * \param silent Whether print information during loading.
    * \param data_split_mode In distributed mode, split the input according this mode; otherwise,
    *                        it's just an indicator on how the input was split beforehand.
-   * \param file_format The format type of the file, used for dmlc::Parser::Create.
-   *   By default "auto" will be able to load in both local binary file.
-   * \param page_size Page size for external memory.
    * \return The created DMatrix.
    */
-  static DMatrix* Load(const std::string& uri,
-                       bool silent = true,
-                       DataSplitMode data_split_mode = DataSplitMode::kRow,
-                       const std::string& file_format = "auto");
+  static DMatrix* Load(const std::string& uri, bool silent = true,
+                       DataSplitMode data_split_mode = DataSplitMode::kRow);
 
   /**
    * \brief Creates a new DMatrix from an external data adapter.
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java
index 7e4fe6806..8a74b74da 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2021 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -62,8 +62,8 @@ public class BasicWalkThrough {
 
   public static void main(String[] args) throws IOException, XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     HashMap<String, Object> params = new HashMap<String, Object>();
     params.put("eta", 1.0);
@@ -112,7 +112,8 @@ public class BasicWalkThrough {
 
     System.out.println("start build dmatrix from csr sparse data ...");
     //build dmatrix from CSR Sparse Matrix
-    DataLoader.CSRSparseData spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train");
+    DataLoader.CSRSparseData spData =
+        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm");
 
     DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data,
                                     DMatrix.SparseType.CSR, 127);
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java
index 7eb9e99f0..fe5db0465 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BoostFromPrediction.java
@@ -32,8 +32,8 @@ public class BoostFromPrediction {
     System.out.println("start running example to start from a initial prediction");
 
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java
index dbe5f368c..3577be226 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CrossValidation.java
@@ -30,7 +30,7 @@ import ml.dmlc.xgboost4j.java.XGBoostError;
 public class CrossValidation {
   public static void main(String[] args) throws IOException, XGBoostError {
     //load train mat
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
 
     //set params
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java
index 6d529974c..c631dc01a 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/CustomObjective.java
@@ -139,9 +139,9 @@ public class CustomObjective {
 
   public static void main(String[] args) throws XGBoostError {
     //load train mat (svmlight format)
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
     //load valid mat (svmlight format)
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     HashMap<String, Object> params = new HashMap<String, Object>();
     params.put("eta", 1.0);
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java
index 61e752f85..9e52c12fd 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java
@@ -29,9 +29,9 @@ import ml.dmlc.xgboost4j.java.example.util.DataLoader;
 public class EarlyStopping {
   public static void main(String[] args) throws IOException, XGBoostError {
     DataLoader.CSRSparseData trainCSR =
-        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train");
+        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm");
     DataLoader.CSRSparseData testCSR =
-        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test");
+        DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test?format=libsvm");
 
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java
index 349098ae1..70b2b85b5 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/ExternalMemory.java
@@ -32,8 +32,8 @@ public class ExternalMemory {
     //this is the only difference, add a # followed by a cache prefix name
     //several cache file with the prefix will be generated
     //currently only support convert from libsvm file
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java
index 422cdea6a..09cc91c7f 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/GeneralizedLinearModel.java
@@ -32,8 +32,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval;
 public class GeneralizedLinearModel {
   public static void main(String[] args) throws XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     //change booster to gblinear, so that we are fitting a linear model
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java
index c98534a93..9038502bd 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictFirstNtree.java
@@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.example.util.CustomEval;
 public class PredictFirstNtree {
   public static void main(String[] args) throws XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java
index 0fcfb39de..7b1dfcb28 100644
--- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java
+++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/PredictLeafIndices.java
@@ -31,8 +31,8 @@ import ml.dmlc.xgboost4j.java.XGBoostError;
 public class PredictLeafIndices {
   public static void main(String[] args) throws XGBoostError {
     // load file from text file, also binary buffer generated by xgboost4j
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm");
+    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm");
 
     //specify parameters
     HashMap<String, Object> params = new HashMap<String, Object>();
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala
index e8481b047..1893288b4 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2023 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -36,8 +36,8 @@ object BasicWalkThrough {
   }
 
   def main(args: Array[String]): Unit = {
-    val trainMax = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMax = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
@@ -76,7 +76,7 @@ object BasicWalkThrough {
 
     // build dmatrix from CSR Sparse Matrix
     println("start build dmatrix from csr sparse data ...")
-    val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train")
+    val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm")
     val trainMax2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data,
       JDMatrix.SparseType.CSR)
     trainMax2.setLabel(spData.labels)
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala
index b894532fa..09b72fc50 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BoostFromPrediction.scala
@@ -24,8 +24,8 @@ object BoostFromPrediction {
   def main(args: Array[String]): Unit = {
     println("start running example to start from a initial prediction")
 
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala
index 62f8b461a..6083209ec 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CrossValidation.scala
@@ -21,7 +21,7 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix}
 
 object CrossValidation {
   def main(args: Array[String]): Unit = {
-    val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train")
+    val trainMat: DMatrix = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
 
     // set params
     val params = new mutable.HashMap[String, Any]
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala
index fe88423e7..8cc49c90d 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/CustomObjective.scala
@@ -138,8 +138,8 @@ object CustomObjective {
   }
 
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
     params += "max_depth" -> 2
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala
index 447c98295..c7f3d8bbb 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/ExternalMemory.scala
@@ -25,8 +25,8 @@ object ExternalMemory {
     // this is the only difference, add a # followed by a cache prefix name
     // several cache file with the prefix will be generated
     // currently only support convert from libsvm file
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train#dtrain.cache")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test#dtest.cache")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm#dtrain.cache")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm#dtest.cache")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala
index 27ed98eca..e370010b6 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/GeneralizedLinearModel.scala
@@ -27,8 +27,8 @@ import ml.dmlc.xgboost4j.scala.example.util.CustomEval
  */
 object GeneralizedLinearModel {
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     // specify parameters
     // change booster to gblinear, so that we are fitting a linear model
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala
index 5395e3638..40a5ffc44 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictFirstNTree.scala
@@ -23,8 +23,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix}
 object PredictFirstNTree {
 
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala
index f40a8aac6..7ae2e6520 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/PredictLeafIndices.scala
@@ -25,8 +25,8 @@ import ml.dmlc.xgboost4j.scala.{XGBoost, DMatrix}
 object PredictLeafIndices {
 
   def main(args: Array[String]): Unit = {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val params = new mutable.HashMap[String, Any]()
     params += "eta" -> 1.0
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
index cce1254d0..20a243f5b 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java
@@ -30,8 +30,8 @@ import org.junit.Test;
  * @author hzx
  */
 public class BoosterImplTest {
-  private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1";
-  private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1";
+  private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm";
+  private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1&format=libsvm";
 
   public static class EvalError implements IEvaluation {
     @Override
diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
index cf174c6dd..d658c5529 100644
--- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
+++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/DMatrixTest.java
@@ -4,7 +4,7 @@
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
-    
+
  http://www.apache.org/licenses/LICENSE-2.0
 
  Unless required by applicable law or agreed to in writing, software
@@ -88,7 +88,7 @@ public class DMatrixTest {
   public void testCreateFromFile() throws XGBoostError {
     //create DMatrix from file
     String filePath = writeResourceIntoTempFile("/agaricus.txt.test");
-    DMatrix dmat = new DMatrix(filePath);
+    DMatrix dmat = new DMatrix(filePath + "?format=libsvm");
     //get label
     float[] labels = dmat.getLabel();
     //check length
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
index 05c6856f7..53325effa 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala
@@ -25,7 +25,7 @@ import ml.dmlc.xgboost4j.java.{DMatrix => JDMatrix}
 
 class DMatrixSuite extends AnyFunSuite {
   test("create DMatrix from File") {
-    val dmat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val dmat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     // get label
     val labels: Array[Float] = dmat.getLabel
     // check length
diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
index 8cac9fe4f..2eda1fa2d 100644
--- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
+++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/ScalaBoosterImplSuite.scala
@@ -95,8 +95,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("basic operation of booster") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
 
     val booster = trainBooster(trainMat, testMat)
     val predicts = booster.predict(testMat, true)
@@ -106,8 +106,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
 
   test("save/load model with path") {
 
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val eval = new EvalError
     val booster = trainBooster(trainMat, testMat)
     // save and load
@@ -123,8 +123,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("save/load model with stream") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val eval = new EvalError
     val booster = trainBooster(trainMat, testMat)
     // save and load
@@ -139,7 +139,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("cross validation") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
     val params = List("eta" -> "1.0", "max_depth" -> "3", "silent" -> "1", "nthread" -> "6",
       "objective" -> "binary:logistic", "gamma" -> "1.0", "eval_metric" -> "error").toMap
     val round = 2
@@ -148,8 +148,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test with quantile histo depthwise") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "eval_metric" -> "auc").toMap
@@ -158,8 +158,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test with quantile histo lossguide") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "eval_metric" -> "auc").toMap
@@ -168,8 +168,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test with quantile histo lossguide with max bin") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "3", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "lossguide", "max_leaves" -> "8", "max_bin" -> "16",
@@ -179,8 +179,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test with quantile histo depthwidth with max depth") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_leaves" -> "8", "max_depth" -> "2",
@@ -190,8 +190,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test with quantile histo depthwidth with max depth and max bin") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
@@ -201,7 +201,7 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test training from existing model in scala") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
     val paramMap = List("max_depth" -> "0", "silent" -> "0",
       "objective" -> "binary:logistic", "tree_method" -> "hist",
       "grow_policy" -> "depthwise", "max_depth" -> "2", "max_bin" -> "2",
@@ -213,8 +213,8 @@ class ScalaBoosterImplSuite extends AnyFunSuite {
   }
 
   test("test getting number of features from a booster") {
-    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train")
-    val testMat = new DMatrix("../../demo/data/agaricus.txt.test")
+    val trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm")
+    val testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm")
     val booster = trainBooster(trainMat, testMat)
 
     TestCase.assertEquals(booster.getNumFeature, 127)
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 5566e0b2d..026381fe1 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -882,5 +882,12 @@ def data_dir(path: str) -> str:
     return os.path.join(demo_dir(path), "data")
 
 
+def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]:
+    """Load the agaricus demo train/test sets, parsed as libsvm text."""
+    train, test = "agaricus.txt.train?format=libsvm", "agaricus.txt.test?format=libsvm"
+    root = data_dir(path)
+    return xgb.DMatrix(os.path.join(root, train)), xgb.DMatrix(os.path.join(root, test))
+
+
 def project_root(path: str) -> str:
     return normpath(os.path.join(demo_dir(path), os.path.pardir))
diff --git a/src/data/data.cc b/src/data/data.cc
index 236bd9131..1aedd6d92 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -819,8 +819,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) {
   return nullptr;
 }
 
-DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode,
-                       const std::string& file_format) {
+DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) {
   auto need_split = false;
   if (collective::IsFederated()) {
     LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers";
@@ -862,11 +861,9 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
   }
 
   // legacy handling of binary data loading
-  if (file_format == "auto") {
-    DMatrix* loaded = TryLoadBinary(fname, silent);
-    if (loaded) {
-      return loaded;
-    }
+  DMatrix* loaded = TryLoadBinary(fname, silent);
+  if (loaded) {
+    return loaded;
   }
 
   int partid = 0, npart = 1;
@@ -882,47 +879,24 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts";
   }
 
+  data::ValidateFileFormat(fname);
   DMatrix* dmat {nullptr};
-  try {
-    if (cache_file.empty()) {
-      std::unique_ptr<dmlc::Parser<uint32_t>> parser(
-          dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, file_format.c_str()));
-      data::FileAdapter adapter(parser.get());
-      dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
-                             cache_file, data_split_mode);
-    } else {
-      data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart),
-                              file_format};
-      dmat = new data::SparsePageDMatrix{&iter,
-                                         iter.Proxy(),
-                                         data::fileiter::Reset,
-                                         data::fileiter::Next,
-                                         std::numeric_limits<float>::quiet_NaN(),
-                                         1,
-                                         cache_file};
-    }
-  } catch (dmlc::Error& e) {
-    std::vector<std::string> splited = common::Split(fname, '#');
-    std::vector<std::string> args = common::Split(splited.front(), '?');
-    std::string format {file_format};
-    if (args.size() == 1 && file_format == "auto") {
-      auto extension = common::Split(args.front(), '.').back();
-      if (extension == "csv" || extension == "libsvm") {
-        format = extension;
-      }
-      if (format == extension) {
-        LOG(WARNING)
-            << "No format parameter is provided in input uri, but found file extension: "
-            << format << " .  "
-            << "Consider providing a uri parameter: filename?format=" << format;
-      } else {
-        LOG(WARNING)
-            << "No format parameter is provided in input uri.  "
-            << "Choosing default parser in dmlc-core.  "
-            << "Consider providing a uri parameter like: filename?format=csv";
-      }
-    }
-    LOG(FATAL) << "Encountered parser error:\n" << e.what();
+
+  if (cache_file.empty()) {
+    std::unique_ptr<dmlc::Parser<uint32_t>> parser(
+        dmlc::Parser<uint32_t>::Create(fname.c_str(), partid, npart, "auto"));
+    data::FileAdapter adapter(parser.get());
+    dmat = DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(),
+                           cache_file, data_split_mode);
+  } else {
+    data::FileIterator iter{fname, static_cast<uint32_t>(partid), static_cast<uint32_t>(npart)};
+    dmat = new data::SparsePageDMatrix{&iter,
+                                       iter.Proxy(),
+                                       data::fileiter::Reset,
+                                       data::fileiter::Next,
+                                       std::numeric_limits<float>::quiet_NaN(),
+                                       1,
+                                       cache_file};
   }
 
   if (need_split && data_split_mode == DataSplitMode::kCol) {
diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h
index 96f0e09d4..4d7239677 100644
--- a/src/data/file_iterator.h
+++ b/src/data/file_iterator.h
@@ -1,22 +1,50 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #ifndef XGBOOST_DATA_FILE_ITERATOR_H_
 #define XGBOOST_DATA_FILE_ITERATOR_H_
 
-#include <string>
+#include <map>
 #include <memory>
-#include <vector>
+#include <string>
 #include <utility>
+#include <vector>
 
+#include "array_interface.h"
 #include "dmlc/data.h"
 #include "xgboost/c_api.h"
 #include "xgboost/json.h"
 #include "xgboost/linalg.h"
-#include "array_interface.h"
 
 namespace xgboost {
 namespace data {
+/** Raise a fatal error unless `uri` (`path?k=v&...#cache`) has a `format` URI parameter. */
+inline void ValidateFileFormat(std::string const& uri) {
+  std::vector<std::string> name_cache = common::Split(uri, '#');
+  CHECK(!name_cache.empty()) << "File path is empty.";  // guards `name_cache[0]` below
+  CHECK_LE(name_cache.size(), 2)
+      << "Only one `#` is allowed in file path for cachefile specification";
+  std::vector<std::string> name_args = common::Split(name_cache[0], '?');
+  CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path.";
+  StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
+  CHECK_EQ(name_args.size(), 2) << msg;
+  // Each `&`-separated argument must be `key=value`; track whether `format` is present.
+  bool has_format = false;
+  std::vector<std::string> arg_list = common::Split(name_args[1], '&');
+  for (size_t i = 0; i < arg_list.size(); ++i) {
+    std::istringstream is(arg_list[i]);
+    std::string key, value;
+    CHECK(std::getline(is, key, '=')) << "Invalid uri argument format"
+                                      << " for key in arg " << i + 1;
+    CHECK(std::getline(is, value)) << "Invalid uri argument format"
+                                   << " for value in arg " << i + 1;
+    has_format = has_format || key == "format";
+  }
+  if (!has_format) {
+    LOG(FATAL) << msg;
+  }
+}
+
 /**
  * An iterator for implementing external memory support with file inputs.  Users of
  * external memory are encouraged to define their own file parsers/loaders so this one is
@@ -31,8 +59,6 @@ class FileIterator {
   uint32_t part_idx_;
   // Equals to total number of workers.
   uint32_t n_parts_;
-  // Format of the input file, like "libsvm".
-  std::string type_;
 
   DMatrixHandle proxy_;
 
@@ -45,10 +71,9 @@ class FileIterator {
   std::string indices_;
 
  public:
-  FileIterator(std::string uri, unsigned part_index, unsigned num_parts,
-               std::string type)
-      : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts},
-        type_{std::move(type)} {
+  FileIterator(std::string uri, unsigned part_index, unsigned num_parts)
+      : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} {
+    ValidateFileFormat(uri_);
     XGProxyDMatrixCreate(&proxy_);
   }
   ~FileIterator() {
@@ -94,9 +119,7 @@ class FileIterator {
   auto Proxy() -> decltype(proxy_) { return proxy_; }
 
   void Reset() {
-    CHECK(!type_.empty());
-    parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_,
-                                                 n_parts_, type_.c_str()));
+    parser_.reset(dmlc::Parser<uint32_t>::Create(uri_.c_str(), part_idx_, n_parts_, "auto"));
   }
 };
 
diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h
index ccfdbff52..b8de641ff 100644
--- a/tests/cpp/common/test_hist_util.h
+++ b/tests/cpp/common/test_hist_util.h
@@ -88,7 +88,8 @@ inline std::shared_ptr<DMatrix> GetExternalMemoryDMatrixFromData(
     fo << row_data.str() << "\n";
   }
   fo.close();
-  return std::shared_ptr<DMatrix>(DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
+  return std::shared_ptr<DMatrix>(
+      DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"));
 }
 
 // Test that elements are approximately equally distributed among bins
diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc
index 31da2c1fa..bd8c4b9c2 100644
--- a/tests/cpp/data/test_file_iterator.cc
+++ b/tests/cpp/data/test_file_iterator.cc
@@ -29,16 +29,16 @@ TEST(FileIterator, Basic) {
   {
     auto zpath = tmpdir.path + "/0-based.svm";
     CreateBigTestData(zpath, 3 * 64, true);
-    zpath += "?indexing_mode=0";
-    FileIterator iter{zpath, 0, 1, "libsvm"};
+    zpath += "?indexing_mode=0&format=libsvm";
+    FileIterator iter{zpath, 0, 1};
     check_n_features(&iter);
   }
 
   {
     auto opath = tmpdir.path + "/1-based.svm";
     CreateBigTestData(opath, 3 * 64, false);
-    opath += "?indexing_mode=1";
-    FileIterator iter{opath, 0, 1, "libsvm"};
+    opath += "?indexing_mode=1&format=libsvm";
+    FileIterator iter{opath, 0, 1};
     check_n_features(&iter);
   }
 }
diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc
index 895844180..5ebe1c6bd 100644
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -157,8 +157,7 @@ TEST(MetaInfo, LoadQid) {
   dmlc::TemporaryDirectory tempdir;
   std::string tmp_file = tempdir.path + "/qid_test.libsvm";
   {
-    std::unique_ptr<dmlc::Stream> fs(
-      dmlc::Stream::Create(tmp_file.c_str(), "w"));
+    std::unique_ptr<dmlc::Stream> fs(dmlc::Stream::Create(tmp_file.c_str(), "w"));
     dmlc::ostream os(fs.get());
     os << R"qid(3 qid:1 1:1 2:1 3:0 4:0.2 5:0
                 2 qid:1 1:0 2:0 3:1 4:0.1 5:1
@@ -175,7 +174,7 @@ TEST(MetaInfo, LoadQid) {
     os.set_stream(nullptr);
   }
   std::unique_ptr<xgboost::DMatrix> dmat(
-    xgboost::DMatrix::Load(tmp_file, true, xgboost::DataSplitMode::kRow, "libsvm"));
+      xgboost::DMatrix::Load(tmp_file + "?format=libsvm", true, xgboost::DataSplitMode::kRow));
 
   const xgboost::MetaInfo& info = dmat->Info();
   const std::vector<xgboost::bst_uint> expected_group_ptr{0, 4, 8, 12};
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index a37352626..3bdbf5403 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -17,11 +17,15 @@
 
 using namespace xgboost;  // NOLINT
 
+namespace {
+std::string UriSVM(std::string name) { return name + "?format=libsvm"; }  // add libsvm format arg
+}  // namespace
+
 TEST(SimpleDMatrix, MetaInfo) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
 
   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 2);
@@ -37,7 +41,7 @@ TEST(SimpleDMatrix, RowAccess) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file, false);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file), false);
 
   // Loop over the batches and count the records
   int64_t row_count = 0;
@@ -60,7 +64,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
 
   ASSERT_TRUE(dmat->SingleColBlock());
 
@@ -387,7 +391,7 @@ TEST(SimpleDMatrix, SaveLoadBinary) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file);
+  xgboost::DMatrix * dmat = xgboost::DMatrix::Load(UriSVM(tmp_file));
   data::SimpleDMatrix *simple_dmat = dynamic_cast<data::SimpleDMatrix*>(dmat);
 
   const std::string tmp_binfile = tempdir.path + "/csr_source.binary";
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 24dc40949..608c32947 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -16,14 +16,19 @@
 #include "../helpers.h"
 
 using namespace xgboost;  // NOLINT
+namespace {
+std::string UriSVM(std::string name, std::string cache) {
+  return name + "?format=libsvm" + "#" + cache + ".cache";  // `#...` gives the cache file
+}
+}  // namespace
 
 template <typename Page>
 void TestSparseDMatrixLoadFile() {
   dmlc::TemporaryDirectory tmpdir;
   auto opath = tmpdir.path + "/1-based.svm";
   CreateBigTestData(opath, 3 * 64, false);
-  opath += "?indexing_mode=1";
-  data::FileIterator iter{opath, 0, 1, "libsvm"};
+  opath += "?indexing_mode=1&format=libsvm";
+  data::FileIterator iter{opath, 0, 1};
   auto n_threads = 0;
   data::SparsePageDMatrix m{&iter,
                             iter.Proxy(),
@@ -112,15 +117,13 @@ TEST(SparsePageDMatrix, MetaInfo) {
   size_t constexpr kEntries = 24;
   CreateBigTestData(tmp_file, kEntries);
 
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache", false);
+  std::unique_ptr<DMatrix> dmat{xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file), false)};
 
   // Test the metadata that was parsed
   EXPECT_EQ(dmat->Info().num_row_, 8ul);
   EXPECT_EQ(dmat->Info().num_col_, 5ul);
   EXPECT_EQ(dmat->Info().num_nonzero_, kEntries);
   EXPECT_EQ(dmat->Info().labels.Size(), dmat->Info().num_row_);
-
-  delete dmat;
 }
 
 TEST(SparsePageDMatrix, RowAccess) {
@@ -139,7 +142,7 @@ TEST(SparsePageDMatrix, ColAccess) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
+  xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file));
 
   // Loop over the batches and assert the data is as expected
   size_t iter = 0;
@@ -231,7 +234,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {
   std::string filename = tempdir.path + "/simple.libsvm";
   CreateBigTestData(filename, 1 << 16);
 
-  data::FileIterator iter(filename, 0, 1, "auto");
+  data::FileIterator iter(filename + "?format=libsvm", 0, 1);
   std::unique_ptr<DMatrix> sparse{
       new data::SparsePageDMatrix{&iter, iter.Proxy(), data::fileiter::Reset, data::fileiter::Next,
                                   std::numeric_limits<float>::quiet_NaN(), threads, filename}};
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index bb562ffb7..55a44e458 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -13,7 +13,7 @@ TEST(SparsePageDMatrix, EllpackPage) {
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
-  DMatrix* dmat = DMatrix::Load(tmp_file + "#" + tmp_file + ".cache");
+  DMatrix* dmat = DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache");
 
   // Loop over the batches and assert the data is as expected
   size_t n = 0;
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 76fd2f967..7c81b96f9 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -548,7 +548,7 @@ std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
   }
   fo.close();
 
-  std::string uri = tmp_file;
+  std::string uri = tmp_file + "?format=libsvm";
   if (page_size > 0) {
     uri += "#" + tmp_file + ".cache";
   }
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 91e8070c2..a3bb30fcd 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -126,7 +126,8 @@ TEST(Learner, SLOW_CheckMultiBatch) {  // NOLINT
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/big.libsvm";
   CreateBigTestData(tmp_file, 50000);
-  std::shared_ptr<DMatrix> dmat(xgboost::DMatrix::Load(tmp_file + "#" + tmp_file + ".cache"));
+  std::shared_ptr<DMatrix> dmat(
+      xgboost::DMatrix::Load(tmp_file + "?format=libsvm" + "#" + tmp_file + ".cache"));
   EXPECT_FALSE(dmat->SingleColBlock());
   size_t num_row = dmat->Info().num_row_;
   std::vector<bst_float> labels(num_row);
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index fab2a6eca..e512e4bc6 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -21,8 +21,7 @@ class TestBasic:
         assert not lazy_isinstance(a, 'numpy', 'dataframe')
 
     def test_basic(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1,
                  'objective': 'binary:logistic'}
         # specify validations set to watch performance
@@ -61,8 +60,7 @@ class TestBasic:
     def test_metric_config(self):
         # Make sure that the metric configuration happens in booster so the
         # string `['error', 'auc']` doesn't get passed down to core.
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic', 'eval_metric': ['error', 'auc']}
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -78,8 +76,7 @@ class TestBasic:
             np.testing.assert_allclose(predt_0, predt_1)
 
     def test_multiclass(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'num_class': 2}
         # specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
@@ -188,7 +185,7 @@ class TestBasic:
             assert dm.num_col() == cols
 
     def test_cv(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic'}
 
@@ -198,7 +195,7 @@ class TestBasic:
         assert len(cv) == (4)
 
     def test_cv_no_shuffle(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic'}
 
@@ -209,7 +206,7 @@ class TestBasic:
         assert len(cv) == (4)
 
     def test_cv_explicit_fold_indices(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, 'objective':
                   'binary:logistic'}
         folds = [
@@ -268,8 +265,7 @@ class TestBasicPathLike:
 
     def test_DMatrix_init_from_path(self):
         """Initialization from the data path."""
-        dpath = Path('demo/data')
-        dtrain = xgb.DMatrix(dpath / 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         assert dtrain.num_row() == 6513
         assert dtrain.num_col() == 127
 
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index d76205593..610a9236e 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -42,8 +42,7 @@ class TestModels:
         param = {'verbosity': 0, 'objective': 'binary:logistic',
                  'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1,
                  'nthread': 1}
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
         bst = xgb.train(param, dtrain, num_round, watchlist)
@@ -55,8 +54,7 @@ class TestModels:
         assert err < 0.2
 
     def test_dart(self):
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         param = {'max_depth': 5, 'objective': 'binary:logistic',
                  'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1}
         # specify validations set to watch performance
@@ -122,7 +120,7 @@ class TestModels:
 
     def test_boost_from_prediction(self):
         # Re-construct dtrain here to avoid modification
-        margined = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        margined, _ = tm.load_agaricus(__file__)
         bst = xgb.train({'tree_method': 'hist'}, margined, 1)
         predt_0 = bst.predict(margined, output_margin=True)
         margined.set_base_margin(predt_0)
@@ -130,13 +128,13 @@ class TestModels:
         predt_1 = bst.predict(margined)
 
         assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
         predt_2 = bst.predict(dtrain)
         assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
 
     def test_boost_from_existing_model(self):
-        X = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        X, _ = tm.load_agaricus(__file__)
         booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4)
         assert booster.num_boosted_rounds() == 4
         booster = xgb.train({'tree_method': 'hist'}, X, num_boost_round=4,
@@ -156,8 +154,7 @@ class TestModels:
             'objective': 'reg:logistic',
             "tree_method": tree_method
         }
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 10
 
@@ -203,8 +200,7 @@ class TestModels:
         self.run_custom_objective()
 
     def test_multi_eval_metric(self):
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1,
                  'objective': 'binary:logistic'}
@@ -226,7 +222,7 @@ class TestModels:
             param['scale_pos_weight'] = ratio
             return (dtrain, dtest, param)
 
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'auc'}, seed=0, fpreproc=fpreproc)
 
@@ -234,7 +230,7 @@ class TestModels:
         param = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                  'objective': 'binary:logistic'}
         num_round = 2
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtrain, _ = tm.load_agaricus(__file__)
         xgb.cv(param, dtrain, num_round, nfold=5,
                metrics={'error'}, seed=0, show_stdv=False)
 
@@ -392,7 +388,7 @@ class TestModels:
         os.remove(model_path)
 
         try:
-            dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+            dtrain, _ = tm.load_agaricus(__file__)
             xgb.train({'objective': 'foo'}, dtrain, num_boost_round=1)
         except ValueError as e:
             e_str = str(e)
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index e8375aa5e..d3ec05e6e 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -275,9 +275,7 @@ class TestCallbacks:
         """Test learning rate scheduler, used by both CPU and GPU tests."""
         scheduler = xgb.callback.LearningRateScheduler
 
-        dpath = tm.data_dir(__file__)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
 
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
         num_round = 4
@@ -361,9 +359,7 @@ class TestCallbacks:
         num_round = 4
         scheduler = xgb.callback.LearningRateScheduler
 
-        dpath = tm.data_dir(__file__)
-        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
-        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        dtrain, dtest = tm.load_agaricus(__file__)
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
 
         param = {
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index ef56ff656..bcc089afb 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -283,7 +283,7 @@ class TestDMatrix:
             assert m0.feature_types == m1.feature_types
 
     def test_get_info(self):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         dtrain.get_float_info('label')
         dtrain.get_float_info('weight')
         dtrain.get_float_info('base_margin')
@@ -432,7 +432,9 @@ class TestDMatrix:
     def test_uri_categorical(self):
         path = os.path.join(dpath, 'agaricus.txt.train')
         feature_types = ["q"] * 5 + ["c"] + ["q"] * 120
-        Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types)
+        Xy = xgb.DMatrix(
+            path + "?indexing_mode=1&format=libsvm", feature_types=feature_types
+        )
         np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types))
 
     def test_base_margin(self):
diff --git a/tests/python/test_interaction_constraints.py b/tests/python/test_interaction_constraints.py
index 96d2ba7dc..5eaaf1f8c 100644
--- a/tests/python/test_interaction_constraints.py
+++ b/tests/python/test_interaction_constraints.py
@@ -88,8 +88,12 @@ class TestInteractionConstraints:
     def training_accuracy(self, tree_method):
         """Test accuracy, reused by GPU tests."""
         from sklearn.metrics import accuracy_score
-        dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
-        dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
+        dtrain = xgboost.DMatrix(
+            dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm"
+        )
+        dtest = xgboost.DMatrix(
+            dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm"
+        )
         params = {
             'eta': 1,
             'max_depth': 6,
diff --git a/tests/python/test_monotone_constraints.py b/tests/python/test_monotone_constraints.py
index 4dbfaa60d..a3785f1cb 100644
--- a/tests/python/test_monotone_constraints.py
+++ b/tests/python/test_monotone_constraints.py
@@ -134,8 +134,8 @@ class TestMonotoneConstraints:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_training_accuracy(self):
         from sklearn.metrics import accuracy_score
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
+        dtrain = xgb.DMatrix(dpath + "agaricus.txt.train?indexing_mode=1&format=libsvm")
+        dtest = xgb.DMatrix(dpath + "agaricus.txt.test?indexing_mode=1&format=libsvm")
         params = {'eta': 1, 'max_depth': 6, 'objective': 'binary:logistic',
                   'tree_method': 'hist', 'monotone_constraints': '(1, 0)'}
         num_boost_round = 5
diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py
index c53363736..82b0ba270 100644
--- a/tests/python/test_openmp.py
+++ b/tests/python/test_openmp.py
@@ -13,9 +13,7 @@ pytestmark = tm.timeout(10)
 
 class TestOMP:
     def test_omp(self):
-        dpath = 'demo/data/'
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain, dtest = tm.load_agaricus(__file__)
 
         param = {'booster': 'gbtree',
                  'objective': 'binary:logistic',
diff --git a/tests/python/test_parse_tree.py b/tests/python/test_parse_tree.py
index 885c0f1e2..9d80d0f6f 100644
--- a/tests/python/test_parse_tree.py
+++ b/tests/python/test_parse_tree.py
@@ -13,7 +13,7 @@ rng = np.random.RandomState(1994)
 
 class TestTreesToDataFrame:
     def build_model(self, max_depth, num_round):
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dtrain, _ = tm.load_agaricus(__file__)
         param = {'max_depth': max_depth, 'objective': 'binary:logistic',
                  'verbosity': 1}
         num_round = num_round
diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py
index dc45cd254..303c7c8c1 100644
--- a/tests/python/test_plotting.py
+++ b/tests/python/test_plotting.py
@@ -17,12 +17,10 @@ except ImportError:
 pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(),
                                                  tm.no_graphviz()))
 
-dpath = 'demo/data/agaricus.txt.train'
-
 
 class TestPlotting:
     def test_plotting(self):
-        m = xgb.DMatrix(dpath)
+        m, _ = tm.load_agaricus(__file__)
         booster = xgb.train({'max_depth': 2, 'eta': 1,
                              'objective': 'binary:logistic'}, m,
                             num_boost_round=2)
diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 4d861ad6e..2585da088 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -46,8 +46,8 @@ class TestSHAP:
         fscores = bst.get_fscore()
         assert scores1 == fscores
 
-        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm')
+        dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm')
 
         def fn(max_depth, num_rounds):
             # train
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index dd710f6a4..78097a4ea 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -154,9 +154,7 @@ class TestTreeMethod:
 
     def test_hist_categorical(self):
         # hist must be same as exact on all-categorial data
-        dpath = 'demo/data/'
-        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
-        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
+        ag_dtrain, ag_dtest = tm.load_agaricus(__file__)
         ag_param = {'max_depth': 2,
                     'tree_method': 'hist',
                     'eta': 1,
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index 07295eb6c..f8a21b6ab 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -222,7 +222,7 @@ class TestPandas:
         set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist")
 
     def test_cv_as_pandas(self):
-        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
+        dm, _ = tm.load_agaricus(__file__)
         params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
                   'objective': 'binary:logistic', 'eval_metric': 'error'}
 

From 08ce495b5de973033160e7c7b650abf59346a984 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 28 Apr 2023 21:47:14 +0800
Subject: [PATCH 34/34] Use Booster context in DMatrix. (#8896)

- Pass context from booster to DMatrix.
- Use context instead of integer for `n_threads`.
- Check that the `max_bin` configuration is consistent.
- Test for all combinations of initialization options.
---
 include/xgboost/context.h                     |  12 ++
 include/xgboost/data.h                        | 138 +++++++++-----
 python-package/xgboost/testing/__init__.py    |   6 +-
 src/c_api/c_api.cc                            |  72 ++++---
 src/common/error_msg.h                        |   5 +
 src/common/hist_util.cc                       |  27 ++-
 src/common/hist_util.h                        |   2 +-
 src/common/quantile.cc                        |  44 ++---
 src/common/quantile.h                         |  17 +-
 src/data/batch_utils.h                        |  33 ++++
 src/data/ellpack_page.cc                      |   6 +-
 src/data/ellpack_page.cu                      |  16 +-
 src/data/ellpack_page.cuh                     |   2 +-
 src/data/ellpack_page_source.cu               |   9 +-
 src/data/ellpack_page_source.h                |  10 +-
 src/data/gradient_index.cc                    |  16 +-
 src/data/gradient_index.h                     |  28 +--
 src/data/iterative_dmatrix.cc                 | 164 ++++++++++------
 src/data/iterative_dmatrix.cu                 |  69 ++++---
 src/data/iterative_dmatrix.h                  |  69 +++----
 src/data/proxy_dmatrix.h                      |  57 +++---
 src/data/simple_dmatrix.cc                    | 176 +++++++++++-------
 src/data/simple_dmatrix.cu                    |  20 +-
 src/data/simple_dmatrix.h                     |  17 +-
 src/data/sparse_page_dmatrix.cc               |  75 ++++----
 src/data/sparse_page_dmatrix.cu               |  39 ++--
 src/data/sparse_page_dmatrix.h                |  43 +++--
 src/linear/coordinate_common.h                | 109 +++++------
 src/linear/updater_coordinate.cc              |  27 ++-
 src/linear/updater_gpu_coordinate.cu          |  15 +-
 src/linear/updater_shotgun.cc                 |  17 +-
 src/predictor/cpu_predictor.cc                |   2 +-
 src/predictor/gpu_predictor.cu                |   4 +-
 src/tree/gpu_hist/gradient_based_sampler.cu   |  88 ++++-----
 src/tree/gpu_hist/gradient_based_sampler.cuh  |  47 +++--
 src/tree/updater_approx.cc                    |   9 +-
 src/tree/updater_colmaker.cc                  |   6 +-
 src/tree/updater_gpu_hist.cu                  |  11 +-
 src/tree/updater_quantile_hist.cc             |  26 +--
 tests/cpp/common/test_column_matrix.cc        |   9 +-
 tests/cpp/common/test_hist_util.cc            |  45 +++--
 tests/cpp/common/test_hist_util.cu            |  11 +-
 tests/cpp/common/test_quantile.cc             |  43 +++--
 tests/cpp/data/test_ellpack_page.cu           |  54 +++---
 .../cpp/data/test_ellpack_page_raw_format.cu  |  14 +-
 tests/cpp/data/test_gradient_index.cc         |  53 ++++--
 .../test_gradient_index_page_raw_format.cc    |  14 +-
 tests/cpp/data/test_iterative_dmatrix.cc      |   3 +-
 tests/cpp/data/test_iterative_dmatrix.cu      |  28 ++-
 tests/cpp/data/test_iterative_dmatrix.h       |  23 ++-
 tests/cpp/data/test_simple_dmatrix.cc         |   3 +-
 tests/cpp/data/test_sparse_page_dmatrix.cc    |  30 +--
 tests/cpp/data/test_sparse_page_dmatrix.cu    |  39 ++--
 tests/cpp/data/test_sparse_page_raw_format.cc |  21 ++-
 tests/cpp/helpers.h                           |   5 +
 tests/cpp/test_serialization.cc               |   6 +-
 .../gpu_hist/test_gradient_based_sampler.cu   |  27 +--
 tests/cpp/tree/gpu_hist/test_histogram.cu     |  19 +-
 tests/cpp/tree/hist/test_evaluate_splits.cc   |   4 +-
 tests/cpp/tree/hist/test_histogram.cc         |  76 ++++----
 tests/cpp/tree/test_approx.cc                 |   6 +-
 tests/cpp/tree/test_common_partitioner.cc     |   2 +-
 tests/cpp/tree/test_gpu_hist.cu               |  41 ++--
 tests/cpp/tree/test_quantile_hist.cc          |   2 +-
 tests/cpp/tree/test_regen.cc                  |  15 +-
 .../test_device_quantile_dmatrix.py           |  82 +++++++-
 tests/python-gpu/test_gpu_updaters.py         |  10 +-
 67 files changed, 1283 insertions(+), 935 deletions(-)
 create mode 100644 src/data/batch_utils.h

diff --git a/include/xgboost/context.h b/include/xgboost/context.h
index aaa1e3eb8..f1cd391df 100644
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -50,7 +50,19 @@ struct Context : public XGBoostParameter<Context> {
 
   bool IsCPU() const { return gpu_id == kCpuId; }
   bool IsCUDA() const { return !IsCPU(); }
+
   CUDAContext const* CUDACtx() const;
+  // Make a CUDA context based on the current context.
+  Context MakeCUDA(std::int32_t device = 0) const {
+    Context ctx = *this;
+    ctx.gpu_id = device;
+    return ctx;
+  }
+  Context MakeCPU() const {
+    Context ctx = *this;
+    ctx.gpu_id = kCpuId;
+    return ctx;
+  }
 
   // declare parameters
   DMLC_DECLARE_PARAMETER(Context) {
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 3f7b6ad85..6305abff8 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) 2015-2022 by XGBoost Contributors
+/**
+ * Copyright 2015-2023 by XGBoost Contributors
  * \file data.h
  * \brief The input data structure of xgboost.
  * \author Tianqi Chen
@@ -238,44 +238,72 @@ struct Entry {
   }
 };
 
-/*!
- * \brief Parameters for constructing batches.
+/**
+ * \brief Parameters for constructing histogram index batches.
  */
 struct BatchParam {
-  /*! \brief The GPU device to use. */
-  int gpu_id {-1};
-  /*! \brief Maximum number of bins per feature for histograms. */
+  /**
+   * \brief Maximum number of bins per feature for histograms.
+   */
   bst_bin_t max_bin{0};
-  /*! \brief Hessian, used for sketching with future approx implementation. */
+  /**
+   * \brief Hessian, used for sketching with future approx implementation.
+   */
   common::Span<float> hess;
-  /*! \brief Whether should DMatrix regenerate the batch.  Only used for GHistIndex. */
-  bool regen {false};
-  /*! \brief Parameter used to generate column matrix for hist. */
+  /**
+   * \brief Whether we should force DMatrix to regenerate the batch.  Only used for
+   *        GHistIndex.
+   */
+  bool regen{false};
+  /**
+   * \brief Forbid regenerating the gradient index. Used for internal validation.
+   */
+  bool forbid_regen{false};
+  /**
+   * \brief Parameter used to generate column matrix for hist.
+   */
   double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};
 
+  /**
+   * \brief Used by exact or other tree methods that don't need a histogram.
+   */
   BatchParam() = default;
-  // GPU Hist
-  BatchParam(int32_t device, bst_bin_t max_bin)
-      : gpu_id{device}, max_bin{max_bin} {}
-  // Hist
+  /**
+   * \brief Used by the hist tree method.
+   */
   BatchParam(bst_bin_t max_bin, double sparse_thresh)
       : max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
-  // Approx
   /**
-   * \brief Get batch with sketch weighted by hessian.  The batch will be regenerated if
-   *        the span is changed, so caller should keep the span for each iteration.
+   * \brief Used by the approx tree method.
+   *
+   *   Get batch with sketch weighted by hessian.  The batch will be regenerated if the
+   *   span is changed, so caller should keep the span for each iteration.
    */
   BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
       : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
 
-  bool operator!=(BatchParam const& other) const {
-    if (hess.empty() && other.hess.empty()) {
-      return gpu_id != other.gpu_id || max_bin != other.max_bin;
-    }
-    return gpu_id != other.gpu_id || max_bin != other.max_bin || hess.data() != other.hess.data();
+  bool ParamNotEqual(BatchParam const& other) const {
+    // Check non-floating parameters.
+    bool cond = max_bin != other.max_bin;
+    // Check sparse thresh.
+    bool l_nan = std::isnan(sparse_thresh);
+    bool r_nan = std::isnan(other.sparse_thresh);
+    bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (sparse_thresh != other.sparse_thresh));
+    cond |= st_chg;
+
+    return cond;
   }
-  bool operator==(BatchParam const& other) const {
-    return !(*this != other);
+  bool Initialized() const { return max_bin != 0; }
+  /**
+   * \brief Make a copy of self for DMatrix to describe how its existing index was generated.
+   */
+  BatchParam MakeCache() const {
+    auto p = *this;
+    // These parameters have nothing to do with how the gradient index was generated in the
+    // first place.
+    p.regen = false;
+    p.forbid_regen = false;
+    return p;
   }
 };
 
@@ -435,7 +463,7 @@ class EllpackPage {
    * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
    * in CSR format.
    */
-  explicit EllpackPage(DMatrix* dmat, const BatchParam& param);
+  explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
 
   /*! \brief Destructor. */
   ~EllpackPage();
@@ -551,7 +579,9 @@ class DMatrix {
   template <typename T>
   BatchSet<T> GetBatches();
   template <typename T>
-  BatchSet<T> GetBatches(const BatchParam& param);
+  BatchSet<T> GetBatches(Context const* ctx);
+  template <typename T>
+  BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
   template <typename T>
   bool PageExists() const;
 
@@ -658,18 +688,19 @@ class DMatrix {
 
  protected:
   virtual BatchSet<SparsePage> GetRowBatches() = 0;
-  virtual BatchSet<CSCPage> GetColumnBatches() = 0;
-  virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
-  virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
-  virtual BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) = 0;
-  virtual BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) = 0;
+  virtual BatchSet<CSCPage> GetColumnBatches(Context const* ctx) = 0;
+  virtual BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) = 0;
+  virtual BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, BatchParam const& param) = 0;
+  virtual BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
+                                                      BatchParam const& param) = 0;
+  virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;
 
   virtual bool EllpackExists() const = 0;
   virtual bool GHistIndexExists() const = 0;
   virtual bool SparsePageExists() const = 0;
 };
 
-template<>
+template <>
 inline BatchSet<SparsePage> DMatrix::GetBatches() {
   return GetRowBatches();
 }
@@ -684,34 +715,39 @@ inline bool DMatrix::PageExists<GHistIndexMatrix>() const {
   return this->GHistIndexExists();
 }
 
-template<>
+template <>
 inline bool DMatrix::PageExists<SparsePage>() const {
   return this->SparsePageExists();
 }
 
-template<>
-inline BatchSet<CSCPage> DMatrix::GetBatches() {
-  return GetColumnBatches();
-}
-
-template<>
-inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
-  return GetSortedColumnBatches();
-}
-
-template<>
-inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
-  return GetEllpackBatches(param);
+template <>
+inline BatchSet<SparsePage> DMatrix::GetBatches(Context const*) {
+  return GetRowBatches();
 }
 
 template <>
-inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(const BatchParam& param) {
-  return GetGradientIndex(param);
+inline BatchSet<CSCPage> DMatrix::GetBatches(Context const* ctx) {
+  return GetColumnBatches(ctx);
 }
 
 template <>
-inline BatchSet<ExtSparsePage> DMatrix::GetBatches() {
-  return GetExtBatches(BatchParam{});
+inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context const* ctx) {
+  return GetSortedColumnBatches(ctx);
+}
+
+template <>
+inline BatchSet<EllpackPage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetEllpackBatches(ctx, param);
+}
+
+template <>
+inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetGradientIndex(ctx, param);
+}
+
+template <>
+inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetExtBatches(ctx, param);
 }
 }  // namespace xgboost
 
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 026381fe1..7bf3cf45b 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -317,13 +317,15 @@ class TestDataset:
             enable_categorical=True,
         )
 
-    def get_device_dmat(self) -> xgb.QuantileDMatrix:
+    def get_device_dmat(self, max_bin: Optional[int]) -> xgb.QuantileDMatrix:
         import cupy as cp
 
         w = None if self.w is None else cp.array(self.w)
         X = cp.array(self.X, dtype=np.float32)
         y = cp.array(self.y, dtype=np.float32)
-        return xgb.QuantileDMatrix(X, y, weight=w, base_margin=self.margin)
+        return xgb.QuantileDMatrix(
+            X, y, weight=w, base_margin=self.margin, max_bin=max_bin
+        )
 
     def get_external_dmat(self) -> xgb.DMatrix:
         n_samples = self.X.shape[0]
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 59cb429da..b35879fd7 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -3,30 +3,50 @@
  */
 #include "xgboost/c_api.h"
 
-#include <rabit/c_api.h>
+#include <algorithm>                         // for copy
+#include <cinttypes>                         // for strtoimax
+#include <cmath>                             // for nan
+#include <cstring>                           // for strcmp
+#include <fstream>                           // for operator<<, basic_ostream, ios, stringstream
+#include <functional>                        // for less
+#include <limits>                            // for numeric_limits
+#include <map>                               // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
+#include <memory>                            // for shared_ptr, allocator, __shared_ptr_access
+#include <string>                            // for char_traits, basic_string, operator==, string
+#include <system_error>                      // for errc
+#include <utility>                           // for pair
+#include <vector>                            // for vector
 
-#include <cstring>
-#include <fstream>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "../collective/communicator-inl.h"
-#include "../common/api_entry.h"  // XGBAPIThreadLocalEntry
-#include "../common/charconv.h"
-#include "../common/io.h"
-#include "../data/adapter.h"
-#include "../data/simple_dmatrix.h"
-#include "c_api_utils.h"
-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/global_config.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/json.h"
-#include "xgboost/learner.h"
-#include "xgboost/logging.h"
-#include "xgboost/string_view.h"  // StringView
-#include "xgboost/version_config.h"
+#include "../collective/communicator-inl.h"  // for Allreduce, Broadcast, Finalize, GetProcessor...
+#include "../common/api_entry.h"             // for XGBAPIThreadLocalEntry
+#include "../common/charconv.h"              // for from_chars, to_chars, NumericLimits, from_ch...
+#include "../common/io.h"                    // for FileExtension, LoadSequentialFile, MemoryBuf...
+#include "../common/threading_utils.h"       // for OmpGetNumThreads, ParallelFor
+#include "../data/adapter.h"                 // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
+#include "../data/proxy_dmatrix.h"           // for DMatrixProxy
+#include "../data/simple_dmatrix.h"          // for SimpleDMatrix
+#include "c_api_error.h"                     // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
+#include "c_api_utils.h"                     // for RequiredArg, OptionalArg, GetMissing, CastDM...
+#include "dmlc/base.h"                       // for BeginPtr, DMLC_ATTRIBUTE_UNUSED
+#include "dmlc/io.h"                         // for Stream
+#include "dmlc/parameter.h"                  // for FieldAccessEntry, FieldEntry, ParamManager
+#include "dmlc/thread_local.h"               // for ThreadLocalStore
+#include "rabit/c_api.h"                     // for RabitLinkTag
+#include "rabit/rabit.h"                     // for CheckPoint, LoadCheckPoint
+#include "xgboost/base.h"                    // for bst_ulong, bst_float, GradientPair, bst_feat...
+#include "xgboost/context.h"                 // for Context
+#include "xgboost/data.h"                    // for DMatrix, MetaInfo, DataType, ExtSparsePage
+#include "xgboost/feature_map.h"             // for FeatureMap
+#include "xgboost/global_config.h"           // for GlobalConfiguration, GlobalConfigThreadLocal...
+#include "xgboost/host_device_vector.h"      // for HostDeviceVector
+#include "xgboost/intrusive_ptr.h"           // for xgboost
+#include "xgboost/json.h"                    // for Json, get, Integer, IsA, Boolean, String
+#include "xgboost/learner.h"                 // for Learner, PredictionType
+#include "xgboost/logging.h"                 // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
+#include "xgboost/predictor.h"               // for PredictionCacheEntry
+#include "xgboost/span.h"                    // for Span
+#include "xgboost/string_view.h"             // for StringView, operator<<
+#include "xgboost/version_config.h"          // for XGBOOST_VER_MAJOR, XGBOOST_VER_MINOR, XGBOOS...
 
 #if defined(XGBOOST_USE_FEDERATED)
 #include "../../plugin/federated/federated_server.h"
@@ -341,10 +361,10 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
   API_END();
 }
 
-XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle* out) {
+XGB_DLL int XGProxyDMatrixCreate(DMatrixHandle *out) {
   API_BEGIN();
   xgboost_CHECK_C_ARG_PTR(out);
-  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);;
+  *out = new std::shared_ptr<xgboost::DMatrix>(new xgboost::data::DMatrixProxy);
   API_END();
 }
 
@@ -746,7 +766,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
 
   CHECK_LE(p_m->Info().num_col_, std::numeric_limits<unsigned>::max());
 
-  for (auto const &page : p_m->GetBatches<ExtSparsePage>()) {
+  for (auto const &page : p_m->GetBatches<ExtSparsePage>(p_m->Ctx(), BatchParam{})) {
     CHECK(page.page);
     auto const &h_offset = page.page->offset.ConstHostVector();
     std::copy(h_offset.cbegin(), h_offset.cend(), out_indptr);
diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 4415bf2ee..3f57a63a3 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -28,5 +28,10 @@ constexpr StringView InfInData() {
 constexpr StringView NoF128() {
   return "128-bit floating point is not supported on current platform.";
 }
+
+constexpr StringView InconsistentMaxBin() {
+  return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
+         "and consistent with the Booster being trained.";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index f97003d1d..c9b50792d 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -2,15 +2,18 @@
  * Copyright 2017-2023 by XGBoost Contributors
  * \file hist_util.cc
  */
+#include "hist_util.h"
+
 #include <dmlc/timer.h>
 
 #include <vector>
 
-#include "xgboost/base.h"
 #include "../common/common.h"
-#include "hist_util.h"
 #include "column_matrix.h"
 #include "quantile.h"
+#include "xgboost/base.h"
+#include "xgboost/context.h"  // Context
+#include "xgboost/data.h"     // SparsePage, SortedCSCPage
 
 #if defined(XGBOOST_MM_PREFETCH_PRESENT)
   #include <xmmintrin.h>
@@ -28,10 +31,11 @@ HistogramCuts::HistogramCuts() {
   cut_ptrs_.HostVector().emplace_back(0);
 }
 
-HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, bool use_sorted,
+HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
                               Span<float> const hessian) {
   HistogramCuts out;
-  auto const& info = m->Info();
+  auto const &info = m->Info();
+  auto n_threads = ctx->Threads();
   std::vector<bst_row_t> reduced(info.num_col_, 0);
   for (auto const &page : m->GetBatches<SparsePage>()) {
     auto const &entries_per_column =
@@ -44,16 +48,19 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
   }
 
   if (!use_sorted) {
-    HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
-                                  HostSketchContainer::UseGroup(info), n_threads);
-    for (auto const& page : m->GetBatches<SparsePage>()) {
+    HostSketchContainer container(ctx, max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
+                                  HostSketchContainer::UseGroup(info));
+    for (auto const &page : m->GetBatches<SparsePage>()) {
       container.PushRowPage(page, info, hessian);
     }
     container.MakeCuts(m->Info(), &out);
   } else {
-    SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
-                                    HostSketchContainer::UseGroup(info), n_threads};
-    for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+    SortedSketchContainer container{ctx,
+                                    max_bins,
+                                    m->Info().feature_types.ConstHostSpan(),
+                                    reduced,
+                                    HostSketchContainer::UseGroup(info)};
+    for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
       container.PushColPage(page, info, hessian);
     }
     container.MakeCuts(m->Info(), &out);
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index d95d405eb..6380952d7 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -170,7 +170,7 @@ class HistogramCuts {
  * \param use_sorted Whether should we use SortedCSC for sketching, it's more efficient
  *                   but consumes more memory.
  */
-HistogramCuts SketchOnDMatrix(DMatrix* m, int32_t max_bins, int32_t n_threads,
+HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins,
                               bool use_sorted = false, Span<float> const hessian = {});
 
 enum BinTypeSize : uint8_t {
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index 60626052c..a93184b95 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -16,16 +16,16 @@ namespace xgboost {
 namespace common {
 
 template <typename WQSketch>
-SketchContainerImpl<WQSketch>::SketchContainerImpl(std::vector<bst_row_t> columns_size,
+SketchContainerImpl<WQSketch>::SketchContainerImpl(Context const *ctx,
+                                                   std::vector<bst_row_t> columns_size,
                                                    int32_t max_bins,
                                                    Span<FeatureType const> feature_types,
-                                                   bool use_group,
-                                                   int32_t n_threads)
+                                                   bool use_group)
     : feature_types_(feature_types.cbegin(), feature_types.cend()),
       columns_size_{std::move(columns_size)},
       max_bins_{max_bins},
       use_group_ind_{use_group},
-      n_threads_{n_threads} {
+      n_threads_{ctx->Threads()} {
   monitor_.Init(__func__);
   CHECK_NE(columns_size_.size(), 0);
   sketches_.resize(columns_size_.size());
@@ -380,13 +380,13 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const& info, HistogramCuts* cuts) {
+void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) {
   monitor_.Start(__func__);
   std::vector<typename WQSketch::SummaryContainer> reduced;
   std::vector<int32_t> num_cuts;
   this->AllReduce(info, &reduced, &num_cuts);
 
-  cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
+  p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
   std::vector<typename WQSketch::SummaryContainer> final_summaries(reduced.size());
 
   ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
@@ -401,48 +401,48 @@ void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const& info, HistogramCuts
       a.SetPrune(reduced[fidx], max_num_bins + 1);
       CHECK(a.data && reduced[fidx].data);
       const bst_float mval = a.data[0].value;
-      cuts->min_vals_.HostVector()[fidx] = mval - fabs(mval) - 1e-5f;
+      p_cuts->min_vals_.HostVector()[fidx] = mval - fabs(mval) - 1e-5f;
     } else {
       // Empty column.
       const float mval = 1e-5f;
-      cuts->min_vals_.HostVector()[fidx] = mval;
+      p_cuts->min_vals_.HostVector()[fidx] = mval;
     }
   });
 
   float max_cat{-1.f};
   for (size_t fid = 0; fid < reduced.size(); ++fid) {
     size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
-    typename WQSketch::SummaryContainer const& a = final_summaries[fid];
+    typename WQSketch::SummaryContainer const &a = final_summaries[fid];
     if (IsCat(feature_types_, fid)) {
-      max_cat = std::max(max_cat, AddCategories(categories_.at(fid), cuts));
+      max_cat = std::max(max_cat, AddCategories(categories_.at(fid), p_cuts));
     } else {
-      AddCutPoint<WQSketch>(a, max_num_bins, cuts);
+      AddCutPoint<WQSketch>(a, max_num_bins, p_cuts);
       // push a value that is greater than anything
       const bst_float cpt =
-          (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
+          (a.size > 0) ? a.data[a.size - 1].value : p_cuts->min_vals_.HostVector()[fid];
       // this must be bigger than last value in a scale
       const bst_float last = cpt + (fabs(cpt) + 1e-5f);
-      cuts->cut_values_.HostVector().push_back(last);
+      p_cuts->cut_values_.HostVector().push_back(last);
     }
 
     // Ensure that every feature gets at least one quantile point
-    CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
-    auto cut_size = static_cast<uint32_t>(cuts->cut_values_.HostVector().size());
-    CHECK_GT(cut_size, cuts->cut_ptrs_.HostVector().back());
-    cuts->cut_ptrs_.HostVector().push_back(cut_size);
+    CHECK_LE(p_cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
+    auto cut_size = static_cast<uint32_t>(p_cuts->cut_values_.HostVector().size());
+    CHECK_GT(cut_size, p_cuts->cut_ptrs_.HostVector().back());
+    p_cuts->cut_ptrs_.HostVector().push_back(cut_size);
   }
 
-  cuts->SetCategorical(this->has_categorical_, max_cat);
+  p_cuts->SetCategorical(this->has_categorical_, max_cat);
   monitor_.Stop(__func__);
 }
 
 template class SketchContainerImpl<WQuantileSketch<float, float>>;
 template class SketchContainerImpl<WXQuantileSketch<float, float>>;
 
-HostSketchContainer::HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                                         std::vector<size_t> columns_size, bool use_group,
-                                         int32_t n_threads)
-    : SketchContainerImpl{columns_size, max_bins, ft, use_group, n_threads} {
+HostSketchContainer::HostSketchContainer(Context const *ctx, bst_bin_t max_bins,
+                                         common::Span<FeatureType const> ft,
+                                         std::vector<size_t> columns_size, bool use_group)
+    : SketchContainerImpl{ctx, columns_size, max_bins, ft, use_group} {
   monitor_.Init(__func__);
   ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
     auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
diff --git a/src/common/quantile.h b/src/common/quantile.h
index f8d347112..0a82f7c90 100644
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -800,9 +800,8 @@ class SketchContainerImpl {
    * \param max_bins maximum number of bins for each feature.
    * \param use_group whether is assigned to group to data instance.
    */
-  SketchContainerImpl(std::vector<bst_row_t> columns_size, int32_t max_bins,
-                      common::Span<FeatureType const> feature_types, bool use_group,
-                      int32_t n_threads);
+  SketchContainerImpl(Context const *ctx, std::vector<bst_row_t> columns_size, int32_t max_bins,
+                      common::Span<FeatureType const> feature_types, bool use_group);
 
   static bool UseGroup(MetaInfo const &info) {
     size_t const num_groups =
@@ -894,8 +893,8 @@ class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, fl
   using WQSketch = WQuantileSketch<float, float>;
 
  public:
-  HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                      std::vector<size_t> columns_size, bool use_group, int32_t n_threads);
+  HostSketchContainer(Context const *ctx, bst_bin_t max_bins, common::Span<FeatureType const> ft,
+                      std::vector<size_t> columns_size, bool use_group);
 
   template <typename Batch>
   void PushAdapterBatch(Batch const &batch, size_t base_rowid, MetaInfo const &info, float missing);
@@ -990,10 +989,10 @@ class SortedSketchContainer : public SketchContainerImpl<WXQuantileSketch<float,
   using Super = SketchContainerImpl<WXQuantileSketch<float, float>>;
 
  public:
-  explicit SortedSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
-                                 std::vector<size_t> columns_size, bool use_group,
-                                 int32_t n_threads)
-      : SketchContainerImpl{columns_size, max_bins, ft, use_group, n_threads} {
+  explicit SortedSketchContainer(Context const *ctx, int32_t max_bins,
+                                 common::Span<FeatureType const> ft,
+                                 std::vector<size_t> columns_size, bool use_group)
+      : SketchContainerImpl{ctx, columns_size, max_bins, ft, use_group} {
     monitor_.Init(__func__);
     sketches_.resize(columns_size.size());
     size_t i = 0;
diff --git a/src/data/batch_utils.h b/src/data/batch_utils.h
new file mode 100644
index 000000000..f75d24ffd
--- /dev/null
+++ b/src/data/batch_utils.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#ifndef XGBOOST_DATA_BATCH_UTILS_H_
+#define XGBOOST_DATA_BATCH_UTILS_H_
+
+#include "xgboost/data.h"  // for BatchParam
+
+namespace xgboost::data::detail {
+// Check that at least one of the two batch parameters is initialized.
+inline void CheckEmpty(BatchParam const& l, BatchParam const& r) {
+  if (!l.Initialized()) {
+    CHECK(r.Initialized()) << "Batch parameter is not initialized.";
+  }
+}
+
+/**
+ * \brief Should we regenerate the gradient index?
+ *
+ * \param old Parameter stored in DMatrix.
+ * \param p   New parameter passed in by caller.
+ */
+inline bool RegenGHist(BatchParam old, BatchParam p) {
+  // Parameter is renewed or caller requests a regen
+  if (!p.Initialized()) {
+    // Empty parameter is passed in, don't regenerate so that we can use gindex in
+    // predictor, which doesn't have any training parameter.
+    return false;
+  }
+  return p.regen || old.ParamNotEqual(p);
+}
+}  // namespace xgboost::data::detail
+#endif  // XGBOOST_DATA_BATCH_UTILS_H_
diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc
index b1f24506e..1fd8f12b2 100644
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
  */
 #ifndef XGBOOST_USE_CUDA
 
@@ -12,7 +12,7 @@ class EllpackPageImpl {};
 
 EllpackPage::EllpackPage() = default;
 
-EllpackPage::EllpackPage(DMatrix*, const BatchParam&) {
+EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) {
   LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                 "EllpackPage is required";
 }
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index d631407a1..4409a7ebb 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -17,8 +17,8 @@ namespace xgboost {
 
 EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {}
 
-EllpackPage::EllpackPage(DMatrix* dmat, const BatchParam& param)
-    : impl_{new EllpackPageImpl(dmat, param)} {}
+EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param)
+    : impl_{new EllpackPageImpl{ctx, dmat, param}} {}
 
 EllpackPage::~EllpackPage() = default;
 
@@ -105,29 +105,29 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
 }
 
 // Construct an ELLPACK matrix in memory.
-EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
+EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : is_dense(dmat->IsDense()) {
   monitor_.Init("ellpack_page");
-  dh::safe_cuda(cudaSetDevice(param.gpu_id));
+  dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
 
   n_rows = dmat->Info().num_row_;
 
   monitor_.Start("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
   row_stride = GetRowStride(dmat);
-  cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
+  cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin);
   monitor_.Stop("Quantiles");
 
   monitor_.Start("InitCompressedData");
-  this->InitCompressedData(param.gpu_id);
+  this->InitCompressedData(ctx->gpu_id);
   monitor_.Stop("InitCompressedData");
 
-  dmat->Info().feature_types.SetDevice(param.gpu_id);
+  dmat->Info().feature_types.SetDevice(ctx->gpu_id);
   auto ft = dmat->Info().feature_types.ConstDeviceSpan();
   monitor_.Start("BinningCompression");
   CHECK(dmat->SingleColBlock());
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-    CreateHistIndices(param.gpu_id, batch, ft);
+    CreateHistIndices(ctx->gpu_id, batch, ft);
   }
   monitor_.Stop("BinningCompression");
 }
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index faf44b3b6..ee6a2c221 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -155,7 +155,7 @@ class EllpackPageImpl {
    * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
    * in CSR format.
    */
-  explicit EllpackPageImpl(DMatrix* dmat, const BatchParam& parm);
+  explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
 
   template <typename AdapterBatch>
   explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 872cb0cc6..fb414f4ae 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
  */
 #include <memory>
 #include <utility>
@@ -10,7 +10,7 @@
 namespace xgboost {
 namespace data {
 void EllpackPageSource::Fetch() {
-  dh::safe_cuda(cudaSetDevice(param_.gpu_id));
+  dh::safe_cuda(cudaSetDevice(device_));
   if (!this->ReadCache()) {
     if (count_ != 0 && !sync_) {
       // source is initialized to be the 0th page during construction, so when count_ is 0
@@ -22,8 +22,7 @@ void EllpackPageSource::Fetch() {
     auto const &csr = source_->Page();
     this->page_.reset(new EllpackPage{});
     auto *impl = this->page_->Impl();
-    *impl = EllpackPageImpl(param_.gpu_id, *cuts_, *csr, is_dense_, row_stride_,
-                            feature_types_);
+    *impl = EllpackPageImpl(device_, *cuts_, *csr, is_dense_, row_stride_, feature_types_);
     page_->SetBaseRowId(csr->base_rowid);
     this->WriteCache();
   }
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index dc0802472..121ffcf9e 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2022 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
  */
 
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
@@ -23,19 +23,21 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
   BatchParam param_;
   common::Span<FeatureType const> feature_types_;
   std::unique_ptr<common::HistogramCuts> cuts_;
+  std::int32_t device_;
 
  public:
   EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
                     std::shared_ptr<Cache> cache, BatchParam param,
                     std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
                     common::Span<FeatureType const> feature_types,
-                    std::shared_ptr<SparsePageSource> source)
+                    std::shared_ptr<SparsePageSource> source, std::int32_t device)
       : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
         is_dense_{is_dense},
         row_stride_{row_stride},
         param_{std::move(param)},
         feature_types_{feature_types},
-        cuts_{std::move(cuts)} {
+        cuts_{std::move(cuts)},
+        device_{device} {
     this->source_ = source;
     this->Fetch();
   }
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 3b3323bb5..11e9a4bec 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors
+/**
+ * Copyright 2017-2023, XGBoost Contributors
  * \brief Data type for fast histogram aggregation.
  */
 #include "gradient_index.h"
@@ -19,18 +19,18 @@ namespace xgboost {
 
 GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnMatrix>()} {}
 
-GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
-                                   double sparse_thresh, bool sorted_sketch, int32_t n_threads,
+GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
+                                   double sparse_thresh, bool sorted_sketch,
                                    common::Span<float> hess)
     : max_numeric_bins_per_feat{max_bins_per_feat} {
   CHECK(p_fmat->SingleColBlock());
   // We use sorted sketching for approx tree method since it's more efficient in
   // computation time (but higher memory usage).
-  cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
+  cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);
 
   const uint32_t nbins = cut.Ptrs().back();
   hit_count.resize(nbins, 0);
-  hit_count_tloc_.resize(n_threads * nbins, 0);
+  hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
 
   size_t new_size = 1;
   for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
@@ -45,7 +45,7 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
   auto ft = p_fmat->Info().feature_types.ConstHostSpan();
 
   for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
-    this->PushBatch(batch, ft, n_threads);
+    this->PushBatch(batch, ft, ctx->Threads());
   }
   this->columns_ = std::make_unique<common::ColumnMatrix>();
 
@@ -54,7 +54,7 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
     // hist
     CHECK(!sorted_sketch);
     for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
-      this->columns_->InitFromSparse(page, *this, sparse_thresh, n_threads);
+      this->columns_->InitFromSparse(page, *this, sparse_thresh, ctx->Threads());
     }
   }
 }
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 4c35870db..d36373d6b 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -19,7 +19,6 @@
 #include "../common/threading_utils.h"
 #include "../common/transform_iterator.h"  // for MakeIndexTransformIter
 #include "adapter.h"
-#include "proxy_dmatrix.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 
@@ -155,8 +154,8 @@ class GHistIndexMatrix {
   /**
    * \brief Constrcutor for SimpleDMatrix.
    */
-  GHistIndexMatrix(DMatrix* x, bst_bin_t max_bins_per_feat, double sparse_thresh,
-                   bool sorted_sketch, int32_t n_threads, common::Span<float> hess = {});
+  GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat,
+                   double sparse_thresh, bool sorted_sketch, common::Span<float> hess = {});
   /**
    * \brief Constructor for Iterative DMatrix. Initialize basic information and prepare
    *        for push batch.
@@ -295,28 +294,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
     }
   });
 }
-
-/**
- * \brief Should we regenerate the gradient index?
- *
- * \param old Parameter stored in DMatrix.
- * \param p   New parameter passed in by caller.
- */
-inline bool RegenGHist(BatchParam old, BatchParam p) {
-  // parameter is renewed or caller requests a regen
-  if (p == BatchParam{}) {
-    // empty parameter is passed in, don't regenerate so that we can use gindex in
-    // predictor, which doesn't have any training parameter.
-    return false;
-  }
-
-  // Avoid comparing nan values.
-  bool l_nan = std::isnan(old.sparse_thresh);
-  bool r_nan = std::isnan(p.sparse_thresh);
-  // regenerate if parameter is changed.
-  bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (old.sparse_thresh != p.sparse_thresh));
-  bool param_chg = old.gpu_id != p.gpu_id || old.max_bin != p.max_bin;
-  return p.regen || param_chg || st_chg;
-}
 }      // namespace xgboost
 #endif  // XGBOOST_DATA_GRADIENT_INDEX_H_
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 3a473122a..8eb1c2034 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -1,25 +1,26 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #include "iterative_dmatrix.h"
 
-#include <algorithm>    // std::copy
-#include <cstddef>      // std::size_t
-#include <type_traits>  // std::underlying_type_t
-#include <vector>       // std::vector
+#include <algorithm>    // for copy
+#include <cstddef>      // for size_t
+#include <memory>       // for shared_ptr
+#include <type_traits>  // for underlying_type_t
+#include <vector>       // for vector
 
 #include "../collective/communicator-inl.h"
 #include "../common/categorical.h"  // common::IsCat
 #include "../common/column_matrix.h"
-#include "../tree/param.h"  // FIXME(jiamingy): Find a better way to share this parameter.
+#include "../tree/param.h"          // FIXME(jiamingy): Find a better way to share this parameter.
+#include "batch_utils.h"            // for RegenGHist
 #include "gradient_index.h"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
-#include "xgboost/data.h"  // FeatureType
+#include "xgboost/data.h"  // for FeatureType, DMatrix
 #include "xgboost/logging.h"
 
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
                                    std::shared_ptr<DMatrix> ref, DataIterResetCallback* reset,
                                    XGDMatrixCallbackNext* next, float missing, int nthread,
@@ -34,60 +35,61 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
 
   auto d = MakeProxy(proxy_)->DeviceIdx();
 
-  StringView msg{"All batch should be on the same device."};
-  if (batch_param_.gpu_id != Context::kCpuId) {
-    CHECK_EQ(d, batch_param_.gpu_id) << msg;
-  }
-
-  batch_param_ = BatchParam{d, max_bin};
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
   // hardcoded parameter.
-  batch_param_.sparse_thresh = tree::TrainParam::DftSparseThreshold();
+  BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
 
-  ctx_.UpdateAllowUnknown(
-      Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
-  if (ctx_.IsCPU()) {
-    this->InitFromCPU(iter_handle, missing, ref);
+  if (ctx.IsCPU()) {
+    this->InitFromCPU(&ctx, p, iter_handle, missing, ref);
   } else {
-    this->InitFromCUDA(iter_handle, missing, ref);
+    this->InitFromCUDA(&ctx, p, iter_handle, missing, ref);
   }
+
+  this->fmat_ctx_ = ctx;
+  this->batch_ = p;
 }
 
-void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
-                    common::HistogramCuts* p_cuts) {
-  CHECK(ref_);
+void GetCutsFromRef(Context const* ctx, std::shared_ptr<DMatrix> ref, bst_feature_t n_features,
+                    BatchParam p, common::HistogramCuts* p_cuts) {
+  CHECK(ref);
   CHECK(p_cuts);
-  auto csr = [&]() {
-    for (auto const& page : ref_->GetBatches<GHistIndexMatrix>(p)) {
+  p.forbid_regen = true;
+  // Fetch cuts from GIDX
+  auto csr = [&] {
+    for (auto const& page : ref->GetBatches<GHistIndexMatrix>(ctx, p)) {
       *p_cuts = page.cut;
       break;
     }
   };
-  auto ellpack = [&]() {
-    // workaround ellpack being initialized from CPU.
-    if (p.gpu_id == Context::kCpuId) {
-      p.gpu_id = ref_->Ctx()->gpu_id;
-    }
-    if (p.gpu_id == Context::kCpuId) {
-      p.gpu_id = 0;
-    }
-    for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
+  // Fetch cuts from Ellpack.
+  auto ellpack = [&] {
+    for (auto const& page : ref->GetBatches<EllpackPage>(ctx, p)) {
       GetCutsFromEllpack(page, p_cuts);
       break;
     }
   };
 
-  if (ref_->PageExists<GHistIndexMatrix>()) {
+  if (ref->PageExists<GHistIndexMatrix>() && ref->PageExists<EllpackPage>()) {
+    // Both exist
+    if (ctx->IsCPU()) {
+      csr();
+    } else {
+      ellpack();
+    }
+  } else if (ref->PageExists<GHistIndexMatrix>()) {
     csr();
-  } else if (ref_->PageExists<EllpackPage>()) {
+  } else if (ref->PageExists<EllpackPage>()) {
     ellpack();
   } else {
-    if (p.gpu_id == Context::kCpuId) {
+    // None exist
+    if (ctx->IsCPU()) {
       csr();
     } else {
       ellpack();
     }
   }
-  CHECK_EQ(ref_->Info().num_col_, n_features)
+  CHECK_EQ(ref->Info().num_col_, n_features)
       << "Invalid ref DMatrix, different number of features.";
 }
 
@@ -112,7 +114,8 @@ void SyncFeatureType(std::vector<FeatureType>* p_h_ft) {
 }
 }  // anonymous namespace
 
-void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
+void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
+                                   DataIterHandle iter_handle, float missing,
                                    std::shared_ptr<DMatrix> ref) {
   DMatrixProxy* proxy = MakeProxy(proxy_);
   CHECK(proxy);
@@ -133,7 +136,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   auto const is_valid = data::IsValidFunctor{missing};
   auto nnz_cnt = [&]() {
     return HostAdapterDispatch(proxy, [&](auto const& value) {
-      size_t n_threads = ctx_.Threads();
+      size_t n_threads = ctx->Threads();
       size_t n_features = column_sizes.size();
       linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
       column_sizes_tloc.Data()->Fill(0ul);
@@ -158,10 +161,10 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
     });
   };
 
-  size_t n_features = 0;
-  size_t n_batches = 0;
-  size_t accumulated_rows{0};
-  size_t nnz{0};
+  std::uint64_t n_features = 0;
+  std::size_t n_batches = 0;
+  std::uint64_t accumulated_rows{0};
+  std::uint64_t nnz{0};
 
   /**
    * CPU impl needs an additional loop for accumulating the column size.
@@ -203,7 +206,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   accumulated_rows = 0;
   std::vector<FeatureType> h_ft;
   if (ref) {
-    GetCutsFromRef(ref, Info().num_col_, batch_param_, &cuts);
+    GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
     h_ft = ref->Info().feature_types.HostVector();
   } else {
     size_t i = 0;
@@ -211,9 +214,8 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
       if (!p_sketch) {
         h_ft = proxy->Info().feature_types.ConstHostVector();
         SyncFeatureType(&h_ft);
-        p_sketch.reset(new common::HostSketchContainer{
-            batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
-            ctx_.Threads()});
+        p_sketch.reset(new common::HostSketchContainer{ctx, p.max_bin, h_ft, column_sizes,
+                                                       !proxy->Info().group_ptr_.empty()});
       }
       HostAdapterDispatch(proxy, [&](auto const& batch) {
         proxy->Info().num_nonzero_ = batch_nnz[i];
@@ -237,15 +239,15 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   /**
    * Generate gradient index.
    */
-  this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), batch_param_.max_bin);
+  this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
   size_t rbegin = 0;
   size_t prev_sum = 0;
   size_t i = 0;
   while (iter.Next()) {
     HostAdapterDispatch(proxy, [&](auto const& batch) {
       proxy->Info().num_nonzero_ = batch_nnz[i];
-      this->ghist_->PushAdapterBatch(&ctx_, rbegin, prev_sum, batch, missing, h_ft,
-                                     batch_param_.sparse_thresh, Info().num_row_);
+      this->ghist_->PushAdapterBatch(ctx, rbegin, prev_sum, batch, missing, h_ft, p.sparse_thresh,
+                                     Info().num_row_);
     });
     if (n_batches != 1) {
       this->info_.Extend(std::move(proxy->Info()), false, true);
@@ -265,7 +267,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   accumulated_rows = 0;
   while (iter.Next()) {
     HostAdapterDispatch(proxy, [&](auto const& batch) {
-      this->ghist_->PushAdapterBatchColumns(&ctx_, batch, missing, accumulated_rows);
+      this->ghist_->PushAdapterBatchColumns(ctx, batch, missing, accumulated_rows);
     });
     accumulated_rows += num_rows();
   }
@@ -282,11 +284,27 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   Info().feature_types.HostVector() = h_ft;
 }
 
-BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(BatchParam const& param) {
-  CheckParam(param);
+BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(Context const* ctx,
+                                                              BatchParam const& param) {
+  if (param.Initialized()) {
+    CheckParam(param);
+    CHECK(!detail::RegenGHist(param, batch_)) << error::InconsistentMaxBin();
+  }
+  if (!ellpack_ && !ghist_) {
+    LOG(FATAL) << "`QuantileDMatrix` not initialized.";
+  }
+
   if (!ghist_) {
-    CHECK(ellpack_);
-    ghist_ = std::make_shared<GHistIndexMatrix>(&ctx_, Info(), *ellpack_, param);
+    if (ctx->IsCPU()) {
+      ghist_ = std::make_shared<GHistIndexMatrix>(ctx, Info(), *ellpack_, param);
+    } else if (fmat_ctx_.IsCPU()) {
+      ghist_ = std::make_shared<GHistIndexMatrix>(&fmat_ctx_, Info(), *ellpack_, param);
+    } else {
+      // Can happen when QDM is initialized on GPU, but a CPU version is queried by a different QDM
+      // for cut reference.
+      auto cpu_ctx = ctx->MakeCPU();
+      ghist_ = std::make_shared<GHistIndexMatrix>(&cpu_ctx, Info(), *ellpack_, param);
+    }
   }
 
   if (!std::isnan(param.sparse_thresh) &&
@@ -300,8 +318,9 @@ BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(BatchParam const&
   return BatchSet<GHistIndexMatrix>(begin_iter);
 }
 
-BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(BatchParam const& param) {
-  for (auto const& page : this->GetGradientIndex(param)) {
+BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(Context const* ctx,
+                                                        BatchParam const& param) {
+  for (auto const& page : this->GetGradientIndex(ctx, param)) {
     auto p_out = std::make_shared<SparsePage>();
     p_out->data.Resize(this->Info().num_nonzero_);
     p_out->offset.Resize(this->Info().num_row_ + 1);
@@ -336,5 +355,26 @@ BatchSet<ExtSparsePage> IterativeDMatrix::GetExtBatches(BatchParam const& param)
       BatchIterator<ExtSparsePage>(new SimpleBatchIteratorImpl<ExtSparsePage>(nullptr));
   return BatchSet<ExtSparsePage>(begin_iter);
 }
-}  // namespace data
-}  // namespace xgboost
+
+#if !defined(XGBOOST_USE_CUDA)
+inline void IterativeDMatrix::InitFromCUDA(Context const*, BatchParam const&, DataIterHandle, float,
+                                           std::shared_ptr<DMatrix>) {
+  // silence the warning about unused variables.
+  (void)(proxy_);
+  (void)(reset_);
+  (void)(next_);
+  common::AssertGPUSupport();
+}
+
+inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
+                                                                 BatchParam const& param) {
+  common::AssertGPUSupport();
+  auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
+  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
+}
+
+inline void GetCutsFromEllpack(EllpackPage const&, common::HistogramCuts*) {
+  common::AssertGPUSupport();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::data
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 5e7fc8d4f..f23bbd5a1 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -1,22 +1,24 @@
-/*!
- * Copyright 2020-2022 XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #include <algorithm>
 #include <memory>
 #include <type_traits>
 
 #include "../common/hist_util.cuh"
+#include "batch_utils.h"  // for RegenGHist
 #include "device_adapter.cuh"
 #include "ellpack_page.cuh"
+#include "gradient_index.h"
 #include "iterative_dmatrix.h"
 #include "proxy_dmatrix.cuh"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
 #include "sparse_page_source.h"
 
-namespace xgboost {
-namespace data {
-void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
+namespace xgboost::data {
+void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
+                                    DataIterHandle iter_handle, float missing,
                                     std::shared_ptr<DMatrix> ref) {
   // A handle passed to external iterator.
   DMatrixProxy* proxy = MakeProxy(proxy_);
@@ -46,7 +48,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   int32_t current_device;
   dh::safe_cuda(cudaGetDevice(&current_device));
   auto get_device = [&]() -> int32_t {
-    int32_t d = (ctx_.gpu_id == Context::kCpuId) ? current_device : ctx_.gpu_id;
+    std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
     CHECK_NE(d, Context::kCpuId);
     return d;
   };
@@ -57,8 +59,8 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   common::HistogramCuts cuts;
   do {
     // We use do while here as the first batch is fetched in ctor
-    ctx_.gpu_id = proxy->DeviceIdx();
-    CHECK_LT(ctx_.gpu_id, common::AllVisibleGPUs());
+    // ctx_.gpu_id = proxy->DeviceIdx();
+    CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
     dh::safe_cuda(cudaSetDevice(get_device()));
     if (cols == 0) {
       cols = num_cols();
@@ -68,12 +70,12 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
       CHECK_EQ(cols, num_cols()) << "Inconsistent number of columns.";
     }
     if (!ref) {
-      sketch_containers.emplace_back(proxy->Info().feature_types, batch_param_.max_bin, cols,
-                                     num_rows(), get_device());
+      sketch_containers.emplace_back(proxy->Info().feature_types, p.max_bin, cols, num_rows(),
+                                     get_device());
       auto* p_sketch = &sketch_containers.back();
       proxy->Info().weights_.SetDevice(get_device());
       Dispatch(proxy, [&](auto const& value) {
-        common::AdapterDeviceSketch(value, batch_param_.max_bin, proxy->Info(), missing, p_sketch);
+        common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
       });
     }
     auto batch_rows = num_rows();
@@ -95,8 +97,8 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   if (!ref) {
     HostDeviceVector<FeatureType> ft;
     common::SketchContainer final_sketch(
-        sketch_containers.empty() ? ft : sketch_containers.front().FeatureTypes(),
-        batch_param_.max_bin, cols, accumulated_rows, get_device());
+        sketch_containers.empty() ? ft : sketch_containers.front().FeatureTypes(), p.max_bin, cols,
+        accumulated_rows, get_device());
     for (auto const& sketch : sketch_containers) {
       final_sketch.Merge(sketch.ColumnsPtr(), sketch.Data());
       final_sketch.FixError();
@@ -106,7 +108,7 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
 
     final_sketch.MakeCuts(&cuts);
   } else {
-    GetCutsFromRef(ref, Info().num_col_, batch_param_, &cuts);
+    GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
   }
 
   this->info_.num_row_ = accumulated_rows;
@@ -169,24 +171,34 @@ void IterativeDMatrix::InitFromCUDA(DataIterHandle iter_handle, float missing,
   info_.SynchronizeNumberOfColumns();
 }
 
-BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& param) {
-  CheckParam(param);
+BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
+                                                          BatchParam const& param) {
+  if (param.Initialized()) {
+    CheckParam(param);
+    CHECK(!detail::RegenGHist(param, batch_)) << error::InconsistentMaxBin();
+  }
   if (!ellpack_ && !ghist_) {
     LOG(FATAL) << "`QuantileDMatrix` not initialized.";
   }
-  if (!ellpack_ && ghist_) {
+
+  if (!ellpack_) {
     ellpack_.reset(new EllpackPage());
-    // Evaluation QuantileDMatrix initialized from CPU data might not have the correct GPU
-    // ID.
-    if (this->ctx_.IsCPU()) {
-      this->ctx_.gpu_id = param.gpu_id;
+    if (ctx->IsCUDA()) {
+      this->Info().feature_types.SetDevice(ctx->gpu_id);
+      *ellpack_->Impl() =
+          EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
+    } else if (fmat_ctx_.IsCUDA()) {
+      this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
+      *ellpack_->Impl() =
+          EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
+    } else {
+      // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
+      // for cut reference.
+      auto cuda_ctx = ctx->MakeCUDA();
+      this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
+      *ellpack_->Impl() =
+          EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     }
-    if (this->ctx_.IsCPU()) {
-      this->ctx_.gpu_id = dh::CurrentDevice();
-    }
-    this->Info().feature_types.SetDevice(this->ctx_.gpu_id);
-    *ellpack_->Impl() =
-        EllpackPageImpl(&ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
   }
   CHECK(ellpack_);
   auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
@@ -196,5 +208,4 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& para
 void GetCutsFromEllpack(EllpackPage const& page, common::HistogramCuts* cuts) {
   *cuts = page.Impl()->Cuts();
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h
index 28c4087c4..bcaa5b63c 100644
--- a/src/data/iterative_dmatrix.h
+++ b/src/data/iterative_dmatrix.h
@@ -1,6 +1,8 @@
-/*!
- * Copyright 2020-2022 by Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
  * \file iterative_dmatrix.h
+ *
+ * \brief Implementation of the higher-level `QuantileDMatrix`.
  */
 #ifndef XGBOOST_DATA_ITERATIVE_DMATRIX_H_
 #define XGBOOST_DATA_ITERATIVE_DMATRIX_H_
@@ -10,10 +12,12 @@
 #include <utility>
 #include <vector>
 
+#include "../common/error_msg.h"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
 #include "xgboost/base.h"
 #include "xgboost/c_api.h"
+#include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"
 
 namespace xgboost {
@@ -43,21 +47,17 @@ namespace data {
  */
 class IterativeDMatrix : public DMatrix {
   MetaInfo info_;
-  Context ctx_;
-  BatchParam batch_param_;
   std::shared_ptr<EllpackPage> ellpack_;
   std::shared_ptr<GHistIndexMatrix> ghist_;
+  BatchParam batch_;
 
   DMatrixHandle proxy_;
   DataIterResetCallback *reset_;
   XGDMatrixCallbackNext *next_;
+  Context fmat_ctx_;
 
   void CheckParam(BatchParam const &param) {
-    // FIXME(Jiamingy): https://github.com/dmlc/xgboost/issues/7976
-    if (param.max_bin != batch_param_.max_bin && param.max_bin != 0) {
-      LOG(WARNING) << "Inconsistent max_bin between Quantile DMatrix and Booster:" << param.max_bin
-                   << " vs. " << batch_param_.max_bin;
-    }
+    CHECK_EQ(param.max_bin, batch_.max_bin) << error::InconsistentMaxBin();
     CHECK(!param.regen && param.hess.empty())
         << "Only `hist` and `gpu_hist` tree method can use `QuantileDMatrix`.";
   }
@@ -68,8 +68,10 @@ class IterativeDMatrix : public DMatrix {
     return BatchSet<Page>(BatchIterator<Page>(nullptr));
   }
 
-  void InitFromCUDA(DataIterHandle iter, float missing, std::shared_ptr<DMatrix> ref);
-  void InitFromCPU(DataIterHandle iter_handle, float missing, std::shared_ptr<DMatrix> ref);
+  void InitFromCUDA(Context const *ctx, BatchParam const &p, DataIterHandle iter_handle,
+                    float missing, std::shared_ptr<DMatrix> ref);
+  void InitFromCPU(Context const *ctx, BatchParam const &p, DataIterHandle iter_handle,
+                   float missing, std::shared_ptr<DMatrix> ref);
 
  public:
   explicit IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
@@ -94,51 +96,40 @@ class IterativeDMatrix : public DMatrix {
     LOG(FATAL) << "Not implemented.";
     return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
   }
-  BatchSet<CSCPage> GetColumnBatches() override { return InvalidTreeMethod<CSCPage>(); }
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
+  BatchSet<CSCPage> GetColumnBatches(Context const *) override {
+    return InvalidTreeMethod<CSCPage>();
+  }
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const *) override {
     return InvalidTreeMethod<SortedCSCPage>();
   }
-  BatchSet<GHistIndexMatrix> GetGradientIndex(BatchParam const &param) override;
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const *ctx, BatchParam const &param) override;
 
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam &param) override;
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) override;
+  BatchSet<EllpackPage> GetEllpackBatches(Context const *ctx, const BatchParam &param) override;
+  BatchSet<ExtSparsePage> GetExtBatches(Context const *ctx, BatchParam const &param) override;
 
   bool SingleColBlock() const override { return true; }
 
   MetaInfo &Info() override { return info_; }
   MetaInfo const &Info() const override { return info_; }
 
-  Context const *Ctx() const override { return &ctx_; }
+  Context const *Ctx() const override { return &fmat_ctx_; }
 };
 
 /**
- * \brief Get quantile cuts from reference Quantile DMatrix.
+ * \brief Get quantile cuts from reference (Quantile)DMatrix.
+ *
+ * \param ctx The context of the new DMatrix.
+ * \param ref The reference DMatrix.
+ * \param n_features Number of features, used for validation only.
+ * \param p Batch parameter for the new DMatrix.
+ * \param p_cuts Output quantile cuts.
  */
-void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
-                    common::HistogramCuts *p_cuts);
+void GetCutsFromRef(Context const *ctx, std::shared_ptr<DMatrix> ref, bst_feature_t n_features,
+                    BatchParam p, common::HistogramCuts *p_cuts);
 /**
  * \brief Get quantile cuts from ellpack page.
  */
 void GetCutsFromEllpack(EllpackPage const &page, common::HistogramCuts *cuts);
-
-#if !defined(XGBOOST_USE_CUDA)
-inline void IterativeDMatrix::InitFromCUDA(DataIterHandle, float, std::shared_ptr<DMatrix>) {
-  // silent the warning about unused variables.
-  (void)(proxy_);
-  (void)(reset_);
-  (void)(next_);
-  common::AssertGPUSupport();
-}
-inline BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(const BatchParam &) {
-  common::AssertGPUSupport();
-  auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));
-  return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
-}
-
-inline void GetCutsFromEllpack(EllpackPage const &, common::HistogramCuts *) {
-  common::AssertGPUSupport();
-}
-#endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace data
 }  // namespace xgboost
 
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index 7a15d6498..2c18ffc79 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -25,16 +25,11 @@ class DataIterProxy {
   NextFn* next_;
 
  public:
-  DataIterProxy(DataIterHandle iter, ResetFn* reset, NextFn* next) :
-      iter_{iter},
-      reset_{reset}, next_{next} {}
+  DataIterProxy(DataIterHandle iter, ResetFn* reset, NextFn* next)
+      : iter_{iter}, reset_{reset}, next_{next} {}
 
-  bool Next() {
-    return next_(iter_);
-  }
-  void Reset() {
-    reset_(iter_);
-  }
+  bool Next() { return next_(iter_); }
+  void Reset() { reset_(iter_); }
 };
 
 /*
@@ -68,9 +63,8 @@ class DMatrixProxy : public DMatrix {
   }
 
   void SetArrayData(char const* c_interface);
-  void SetCSRData(char const *c_indptr, char const *c_indices,
-                  char const *c_values, bst_feature_t n_features,
-                  bool on_host);
+  void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
+                  bst_feature_t n_features, bool on_host);
 
   MetaInfo& Info() override { return info_; }
   MetaInfo const& Info() const override { return info_; }
@@ -81,6 +75,12 @@ class DMatrixProxy : public DMatrix {
   bool GHistIndexExists() const override { return false; }
   bool SparsePageExists() const override { return false; }
 
+  template <typename Page>
+  BatchSet<Page> NoBatch() {
+    LOG(FATAL) << "Proxy DMatrix cannot return data batch.";
+    return BatchSet<Page>(BatchIterator<Page>(nullptr));
+  }
+
   DMatrix* Slice(common::Span<int32_t const> /*ridxs*/) override {
     LOG(FATAL) << "Slicing DMatrix is not supported for Proxy DMatrix.";
     return nullptr;
@@ -89,29 +89,19 @@ class DMatrixProxy : public DMatrix {
     LOG(FATAL) << "Slicing DMatrix columns is not supported for Proxy DMatrix.";
     return nullptr;
   }
-  BatchSet<SparsePage> GetRowBatches() override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<SparsePage>(BatchIterator<SparsePage>(nullptr));
+  BatchSet<SparsePage> GetRowBatches() override { return NoBatch<SparsePage>(); }
+  BatchSet<CSCPage> GetColumnBatches(Context const*) override { return NoBatch<CSCPage>(); }
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const*) override {
+    return NoBatch<SortedCSCPage>();
   }
-  BatchSet<CSCPage> GetColumnBatches() override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<CSCPage>(BatchIterator<CSCPage>(nullptr));
+  BatchSet<EllpackPage> GetEllpackBatches(Context const*, BatchParam const&) override {
+    return NoBatch<EllpackPage>();
   }
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(nullptr));
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const*, BatchParam const&) override {
+    return NoBatch<GHistIndexMatrix>();
   }
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(nullptr));
-  }
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<GHistIndexMatrix>(BatchIterator<GHistIndexMatrix>(nullptr));
-  }
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const&) override {
-    LOG(FATAL) << "Not implemented.";
-    return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
+  BatchSet<ExtSparsePage> GetExtBatches(Context const*, BatchParam const&) override {
+    return NoBatch<ExtSparsePage>();
   }
   std::any Adapter() const { return batch_; }
 };
@@ -144,8 +134,7 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
     } else {
       LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
     }
-    return std::result_of_t<Fn(
-        decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
+    return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
   }
 }
 }  // namespace xgboost::data
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index e916311a5..ab75cf03e 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -11,10 +11,12 @@
 #include <type_traits>
 #include <vector>
 
+#include "../common/error_msg.h"  // for InconsistentMaxBin
 #include "../common/random.h"
 #include "../common/threading_utils.h"
 #include "./simple_batch_iterator.h"
 #include "adapter.h"
+#include "batch_utils.h"  // for CheckEmpty, RegenGHist
 #include "gradient_index.h"
 #include "xgboost/c_api.h"
 #include "xgboost/data.h"
@@ -28,7 +30,7 @@ const MetaInfo& SimpleDMatrix::Info() const { return info_; }
 DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
   auto out = new SimpleDMatrix;
   SparsePage& out_page = *out->sparse_page_;
-  for (auto const &page : this->GetBatches<SparsePage>()) {
+  for (auto const& page : this->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     auto& h_data = out_page.data.HostVector();
     auto& h_offset = out_page.offset.HostVector();
@@ -42,7 +44,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
     out->Info() = this->Info().Slice(ridxs);
     out->Info().num_nonzero_ = h_offset.back();
   }
-  out->ctx_ = this->ctx_;
+  out->fmat_ctx_ = this->fmat_ctx_;
   return out;
 }
 
@@ -52,7 +54,7 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
   auto const slice_size = info_.num_col_ / num_slices;
   auto const slice_start = slice_size * slice_id;
   auto const slice_end = (slice_id == num_slices - 1) ? info_.num_col_ : slice_start + slice_size;
-  for (auto const &page : this->GetBatches<SparsePage>()) {
+  for (auto const& page : this->GetBatches<SparsePage>()) {
     auto batch = page.GetView();
     auto& h_data = out_page.data.HostVector();
     auto& h_offset = out_page.offset.HostVector();
@@ -60,9 +62,8 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
     for (bst_row_t i = 0; i < this->Info().num_row_; i++) {
       auto inst = batch[i];
       auto prev_size = h_data.size();
-      std::copy_if(inst.begin(), inst.end(), std::back_inserter(h_data), [&](Entry e) {
-        return e.index >= slice_start && e.index < slice_end;
-      });
+      std::copy_if(inst.begin(), inst.end(), std::back_inserter(h_data),
+                   [&](Entry e) { return e.index >= slice_start && e.index < slice_end; });
       rptr += h_data.size() - prev_size;
       h_offset.emplace_back(rptr);
     }
@@ -73,7 +74,7 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
   return out;
 }
 
-void SimpleDMatrix::ReindexFeatures() {
+void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
   if (info_.IsVerticalFederated()) {
     std::vector<uint64_t> buffer(collective::GetWorldSize());
     buffer[collective::GetRank()] = info_.num_col_;
@@ -82,72 +83,115 @@ void SimpleDMatrix::ReindexFeatures() {
     if (offset == 0) {
       return;
     }
-    sparse_page_->Reindex(offset, ctx_.Threads());
+    sparse_page_->Reindex(offset, ctx->Threads());
   }
 }
 
 BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
   // since csr is the default data structure so `source_` is always available.
-  auto begin_iter = BatchIterator<SparsePage>(
-      new SimpleBatchIteratorImpl<SparsePage>(sparse_page_));
+  auto begin_iter =
+      BatchIterator<SparsePage>(new SimpleBatchIteratorImpl<SparsePage>(sparse_page_));
   return BatchSet<SparsePage>(begin_iter);
 }
 
-BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches() {
+BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
   // column page doesn't exist, generate it
   if (!column_page_) {
-    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx_.Threads())));
+    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
   }
-  auto begin_iter =
-      BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
+  auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
   return BatchSet<CSCPage>(begin_iter);
 }
 
-BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches() {
+BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
   // Sorted column page doesn't exist, generate it
   if (!sorted_column_page_) {
     sorted_column_page_.reset(
-        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx_.Threads())));
-    sorted_column_page_->SortRows(ctx_.Threads());
+        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
+    sorted_column_page_->SortRows(ctx->Threads());
   }
-  auto begin_iter = BatchIterator<SortedCSCPage>(
-      new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_));
+  auto begin_iter =
+      BatchIterator<SortedCSCPage>(new SimpleBatchIteratorImpl<SortedCSCPage>(sorted_column_page_));
   return BatchSet<SortedCSCPage>(begin_iter);
 }
 
-namespace {
-void CheckEmpty(BatchParam const& l, BatchParam const& r) {
-  if (l == BatchParam{}) {
-    CHECK(r != BatchParam{}) << "Batch parameter is not initialized.";
+BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(Context const* ctx,
+                                                       const BatchParam& param) {
+  detail::CheckEmpty(batch_param_, param);
+  if (ellpack_page_ && param.Initialized() && param.forbid_regen) {
+    if (detail::RegenGHist(batch_param_, param)) {
+      CHECK_EQ(batch_param_.max_bin, param.max_bin) << error::InconsistentMaxBin();
+    }
+    CHECK(!detail::RegenGHist(batch_param_, param));
   }
-}
-}  // anonymous namespace
-
-BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(const BatchParam& param) {
-  // ELLPACK page doesn't exist, generate it
-  CheckEmpty(batch_param_, param);
-  if (!ellpack_page_ || RegenGHist(batch_param_, param)) {
-    CHECK_GE(param.gpu_id, 0);
+  if (!ellpack_page_ || detail::RegenGHist(batch_param_, param)) {
+    // ELLPACK page doesn't exist, generate it
+    LOG(INFO) << "Generating new Ellpack page.";
+    // These places can ask for an ellpack page:
+    // - GPU hist: the ctx must be on CUDA.
+    // - IterativeDMatrix::InitFromCUDA: The ctx must be on CUDA.
+    // - IterativeDMatrix::InitFromCPU: It asks for ellpack only if it exists. It should
+    //   not regen, otherwise it indicates a mismatched parameter like max_bin.
     CHECK_GE(param.max_bin, 2);
-    ellpack_page_.reset(new EllpackPage(this, param));
-    batch_param_ = param;
+    if (ctx->IsCUDA()) {
+      // The context passed in is on GPU, we pick it first since we prioritize the context
+      // in Booster.
+      ellpack_page_.reset(new EllpackPage(ctx, this, param));
+    } else if (fmat_ctx_.IsCUDA()) {
+      // DMatrix was initialized on GPU, we use the context from initialization.
+      ellpack_page_.reset(new EllpackPage(&fmat_ctx_, this, param));
+    } else {
+      // Mismatched parameter, user set a new max_bin during training.
+      auto cuda_ctx = ctx->MakeCUDA();
+      ellpack_page_.reset(new EllpackPage(&cuda_ctx, this, param));
+    }
+
+    batch_param_ = param.MakeCache();
   }
   auto begin_iter =
       BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_page_));
   return BatchSet<EllpackPage>(begin_iter);
 }
 
-BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(const BatchParam& param) {
-  CheckEmpty(batch_param_, param);
-  if (!gradient_index_ || RegenGHist(batch_param_, param)) {
+BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(Context const* ctx,
+                                                           const BatchParam& param) {
+  detail::CheckEmpty(batch_param_, param);
+  // Check whether we are allowed to regenerate the gradient index. Forbidding it keeps the
+  // evaluation data consistent with the training data.
+  if (gradient_index_ && param.Initialized() && param.forbid_regen) {
+    if (detail::RegenGHist(batch_param_, param)) {
+      CHECK_EQ(batch_param_.max_bin, param.max_bin) << error::InconsistentMaxBin();
+    }
+    CHECK(!detail::RegenGHist(batch_param_, param)) << "Inconsistent sparse threshold.";
+  }
+  if (!gradient_index_ || detail::RegenGHist(batch_param_, param)) {
+    // GIDX page doesn't exist, generate it
     LOG(INFO) << "Generating new Gradient Index.";
+    // These places can ask for a CSR gidx:
+    // - CPU Hist: the ctx must be on CPU.
+    // - IterativeDMatrix::InitFromCPU: The ctx must be on CPU.
+    // - IterativeDMatrix::InitFromCUDA: It asks for gidx only if it exists. It should not
+    //   regen, otherwise it indicates a mismatched parameter like max_bin.
     CHECK_GE(param.max_bin, 2);
-    CHECK_EQ(param.gpu_id, -1);
     // Used only by approx.
     auto sorted_sketch = param.regen;
-    gradient_index_.reset(new GHistIndexMatrix(this, param.max_bin, param.sparse_thresh,
-                                               sorted_sketch, this->ctx_.Threads(), param.hess));
-    batch_param_ = param;
+    if (ctx->IsCPU()) {
+      // The context passed in is on CPU, we pick it first since we prioritize the context
+      // in Booster.
+      gradient_index_.reset(new GHistIndexMatrix{ctx, this, param.max_bin, param.sparse_thresh,
+                                                 sorted_sketch, param.hess});
+    } else if (fmat_ctx_.IsCPU()) {
+      // DMatrix was initialized on CPU, we use the context from initialization.
+      gradient_index_.reset(new GHistIndexMatrix{&fmat_ctx_, this, param.max_bin,
+                                                 param.sparse_thresh, sorted_sketch, param.hess});
+    } else {
+      // Mismatched parameter, user set a new max_bin during training.
+      auto cpu_ctx = ctx->MakeCPU();
+      gradient_index_.reset(new GHistIndexMatrix{&cpu_ctx, this, param.max_bin, param.sparse_thresh,
+                                                 sorted_sketch, param.hess});
+    }
+
+    batch_param_ = param.MakeCache();
     CHECK_EQ(batch_param_.hess.data(), param.hess.data());
   }
   auto begin_iter = BatchIterator<GHistIndexMatrix>(
@@ -155,7 +199,7 @@ BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(const BatchParam& par
   return BatchSet<GHistIndexMatrix>(begin_iter);
 }
 
-BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
+BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(Context const*, BatchParam const&) {
   auto casted = std::make_shared<ExtSparsePage>(sparse_page_);
   CHECK(casted);
   auto begin_iter =
@@ -166,7 +210,8 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(BatchParam const&) {
 template <typename AdapterT>
 SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
                              DataSplitMode data_split_mode) {
-  this->ctx_.nthread = nthread;
+  Context ctx;
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}});
 
   std::vector<uint64_t> qids;
   uint64_t default_max = std::numeric_limits<uint64_t>::max();
@@ -176,13 +221,13 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
   auto& data_vec = sparse_page_->data.HostVector();
   uint64_t inferred_num_columns = 0;
   uint64_t total_batch_size = 0;
-    // batch_size is either number of rows or cols, depending on data layout
+  // batch_size is either number of rows or cols, depending on data layout
 
   adapter->BeforeFirst();
   // Iterate over batches of input data
   while (adapter->Next()) {
     auto& batch = adapter->Value();
-    auto batch_max_columns = sparse_page_->Push(batch, missing, ctx_.Threads());
+    auto batch_max_columns = sparse_page_->Push(batch, missing, ctx.Threads());
     inferred_num_columns = std::max(batch_max_columns, inferred_num_columns);
     total_batch_size += batch.Size();
     // Append meta information if available
@@ -229,19 +274,18 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
     info_.num_col_ = adapter->NumColumns();
   }
 
-
   // Synchronise worker columns
   info_.data_split_mode = data_split_mode;
-  ReindexFeatures();
+  ReindexFeatures(&ctx);
   info_.SynchronizeNumberOfColumns();
 
   if (adapter->NumRows() == kAdapterUnknownSize) {
-    using IteratorAdapterT
-      = IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
+    using IteratorAdapterT =
+        IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>;
     // If AdapterT is either IteratorAdapter or FileAdapter type, use the total batch size to
     // determine the correct number of rows, as offset_vec may be too short
-    if (std::is_same<AdapterT, IteratorAdapterT>::value
-        || std::is_same<AdapterT, FileAdapter>::value) {
+    if (std::is_same<AdapterT, IteratorAdapterT>::value ||
+        std::is_same<AdapterT, FileAdapter>::value) {
       info_.num_row_ = total_batch_size;
       // Ensure offset_vec.size() - 1 == [number of rows]
       while (offset_vec.size() - 1 < total_batch_size) {
@@ -265,9 +309,11 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
   info_.num_nonzero_ = data_vec.size();
 
   // Sort the index for row partitioners used by variuos tree methods.
-  if (!sparse_page_->IsIndicesSorted(this->ctx_.Threads())) {
-    sparse_page_->SortIndices(this->ctx_.Threads());
+  if (!sparse_page_->IsIndicesSorted(ctx.Threads())) {
+    sparse_page_->SortIndices(ctx.Threads());
   }
+
+  this->fmat_ctx_ = ctx;
 }
 
 SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
@@ -280,12 +326,12 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
 }
 
 void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
-    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
-    int tmagic = kMagic;
-    fo->Write(tmagic);
-    info_.SaveBinary(fo.get());
-    fo->Write(sparse_page_->offset.HostVector());
-    fo->Write(sparse_page_->data.HostVector());
+  std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
+  int tmagic = kMagic;
+  fo->Write(tmagic);
+  info_.SaveBinary(fo.get());
+  fo->Write(sparse_page_->offset.HostVector());
+  fo->Write(sparse_page_->data.HostVector());
 }
 
 template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread,
@@ -305,14 +351,14 @@ template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing,
 template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread,
                                       DataSplitMode data_split_mode);
 template SimpleDMatrix::SimpleDMatrix(
-    IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
-        *adapter,
+    IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter,
     float missing, int nthread, DataSplitMode data_split_mode);
 
 template <>
 SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread,
                              DataSplitMode data_split_mode) {
-    ctx_.nthread = nthread;
+  Context ctx;
+  ctx.nthread = nthread;
 
   auto& offset_vec = sparse_page_->offset.HostVector();
   auto& data_vec = sparse_page_->data.HostVector();
@@ -326,7 +372,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
     size_t num_elements = 0;
     size_t num_rows = 0;
     // Import Arrow RecordBatches
-#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx_.Threads())
+#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads())
     for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
       num_elements += batches[i]->Import(missing);
       num_rows += batches[i]->Size();
@@ -348,7 +394,7 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
     data_vec.resize(total_elements);
     offset_vec.resize(total_batch_size + 1);
     // Copy data into DMatrix
-#pragma omp parallel num_threads(ctx_.Threads())
+#pragma omp parallel num_threads(ctx.Threads())
     {
 #pragma omp for nowait
       for (int i = 0; i < static_cast<int>(batches.size()); ++i) {  // NOLINT
@@ -372,12 +418,14 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
   // Synchronise worker columns
   info_.num_col_ = adapter->NumColumns();
   info_.data_split_mode = data_split_mode;
-  ReindexFeatures();
+  ReindexFeatures(&ctx);
   info_.SynchronizeNumberOfColumns();
 
   info_.num_row_ = total_batch_size;
   info_.num_nonzero_ = data_vec.size();
   CHECK_EQ(offset_vec.back(), info_.num_nonzero_);
+
+  fmat_ctx_ = ctx;
 }
 }  // namespace data
 }  // namespace xgboost
diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu
index fc09f52c4..b2be701d5 100644
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -1,12 +1,14 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
  * \file simple_dmatrix.cu
  */
 #include <thrust/copy.h>
-#include <xgboost/data.h>
+
+#include "device_adapter.cuh"  // for CurrentDevice
 #include "simple_dmatrix.cuh"
 #include "simple_dmatrix.h"
-#include "device_adapter.cuh"
+#include "xgboost/context.h"  // for Context
+#include "xgboost/data.h"
 
 namespace xgboost {
 namespace data {
@@ -15,7 +17,7 @@ namespace data {
 // Current implementation assumes a single batch. More batches can
 // be supported in future. Does not currently support inferring row/column size
 template <typename AdapterT>
-SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread*/,
+SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthread,
                              DataSplitMode data_split_mode) {
   CHECK(data_split_mode != DataSplitMode::kCol)
       << "Column-wise data split is currently not supported on the GPU.";
@@ -24,6 +26,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   CHECK_GE(device, 0);
   dh::safe_cuda(cudaSetDevice(device));
 
+  Context ctx;
+  ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}});
+
   CHECK(adapter->NumRows() != kAdapterUnknownSize);
   CHECK(adapter->NumColumns() != kAdapterUnknownSize);
 
@@ -33,13 +38,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int32_t /*nthread
   // Enforce single batch
   CHECK(!adapter->Next());
 
-  info_.num_nonzero_ =
-      CopyToSparsePage(adapter->Value(), device, missing, sparse_page_.get());
+  info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), device, missing, sparse_page_.get());
   info_.num_col_ = adapter->NumColumns();
   info_.num_row_ = adapter->NumRows();
   // Synchronise worker columns
   info_.data_split_mode = data_split_mode;
   info_.SynchronizeNumberOfColumns();
+
+  this->fmat_ctx_ = ctx;
 }
 
 template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h
index 853e765af..56685c1e6 100644
--- a/src/data/simple_dmatrix.h
+++ b/src/data/simple_dmatrix.h
@@ -32,7 +32,7 @@ class SimpleDMatrix : public DMatrix {
 
   MetaInfo& Info() override;
   const MetaInfo& Info() const override;
-  Context const* Ctx() const override { return &ctx_; }
+  Context const* Ctx() const override { return &fmat_ctx_; }
 
   bool SingleColBlock() const override { return true; }
   DMatrix* Slice(common::Span<int32_t const> ridxs) override;
@@ -43,11 +43,11 @@ class SimpleDMatrix : public DMatrix {
 
  protected:
   BatchSet<SparsePage> GetRowBatches() override;
-  BatchSet<CSCPage> GetColumnBatches() override;
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override;
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override;
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) override;
+  BatchSet<CSCPage> GetColumnBatches(Context const* ctx) override;
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) override;
+  BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, const BatchParam& param) override;
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx, const BatchParam& param) override;
+  BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) override;
 
   MetaInfo info_;
   // Primary storage type
@@ -69,10 +69,11 @@ class SimpleDMatrix : public DMatrix {
    * starting from 0. However, all the algorithms assume the features are globally indexed, so we
    * reindex the features based on the offset needed to obtain the global view.
    */
-  void ReindexFeatures();
+  void ReindexFeatures(Context const* ctx);
 
  private:
-  Context ctx_;
+  // Context used only for DMatrix initialization.
+  Context fmat_ctx_;
 };
 }  // namespace data
 }  // namespace xgboost
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index 5e5b622af..f84fa8c01 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -1,6 +1,7 @@
-/*!
- * Copyright 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file sparse_page_dmatrix.cc
+ *
  * \brief The external memory version of Page Iterator.
  * \author Tianqi Chen
  */
@@ -8,11 +9,10 @@
 
 #include "../collective/communicator-inl.h"
 #include "./simple_batch_iterator.h"
+#include "batch_utils.h"  // for RegenGHist
 #include "gradient_index.h"
 
-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 MetaInfo &SparsePageDMatrix::Info() { return info_; }
 
 const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
@@ -46,7 +46,9 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
                                      int32_t nthreads, std::string cache_prefix)
     : proxy_{proxy_handle}, iter_{iter_handle}, reset_{reset}, next_{next}, missing_{missing},
       cache_prefix_{std::move(cache_prefix)} {
-  ctx_.nthread = nthreads;
+  Context ctx;
+  ctx.nthread = nthreads;
+
   cache_prefix_ = cache_prefix_.empty() ? "DMatrix" : cache_prefix_;
   if (collective::IsDistributed()) {
     cache_prefix_ += ("-r" + std::to_string(collective::GetRank()));
@@ -81,7 +83,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
 
   // the proxy is iterated together with the sparse page source so we can obtain all
   // information in 1 pass.
-  for (auto const &page : this->GetRowBatchesImpl()) {
+  for (auto const &page : this->GetRowBatchesImpl(&ctx)) {
     this->info_.Extend(std::move(proxy->Info()), false, false);
     n_features = std::max(n_features, num_cols());
     n_samples += num_rows();
@@ -98,9 +100,11 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
 
   info_.SynchronizeNumberOfColumns();
   CHECK_NE(info_.num_col_, 0);
+
+  fmat_ctx_ = ctx;
 }
 
-void SparsePageDMatrix::InitializeSparsePage() {
+void SparsePageDMatrix::InitializeSparsePage(Context const *ctx) {
   auto id = MakeCache(this, ".row.page", cache_prefix_, &cache_info_);
   // Don't use proxy DMatrix once this is already initialized, this allows users to
   // release the iterator and data.
@@ -110,33 +114,33 @@ void SparsePageDMatrix::InitializeSparsePage() {
     return;
   }
 
-  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{
-      iter_, reset_, next_};
+  auto iter = DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{iter_, reset_, next_};
   DMatrixProxy *proxy = MakeProxy(proxy_);
   sparse_page_source_.reset();  // clear before creating new one to prevent conflicts.
-  sparse_page_source_ = std::make_shared<SparsePageSource>(
-      iter, proxy, this->missing_, this->ctx_.Threads(), this->info_.num_col_,
-      this->n_batches_, cache_info_.at(id));
+  sparse_page_source_ = std::make_shared<SparsePageSource>(iter, proxy, this->missing_,
+                                                           ctx->Threads(), this->info_.num_col_,
+                                                           this->n_batches_, cache_info_.at(id));
 }
 
-BatchSet<SparsePage> SparsePageDMatrix::GetRowBatchesImpl() {
-  this->InitializeSparsePage();
+BatchSet<SparsePage> SparsePageDMatrix::GetRowBatchesImpl(Context const* ctx) {
+  this->InitializeSparsePage(ctx);
   auto begin_iter = BatchIterator<SparsePage>(sparse_page_source_);
   return BatchSet<SparsePage>(BatchIterator<SparsePage>(begin_iter));
 }
 
 BatchSet<SparsePage> SparsePageDMatrix::GetRowBatches() {
-  return this->GetRowBatchesImpl();
+  // Use context from initialization for the default row page.
+  return this->GetRowBatchesImpl(&fmat_ctx_);
 }
 
-BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() {
+BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches(Context const *ctx) {
   auto id = MakeCache(this, ".col.page", cache_prefix_, &cache_info_);
   CHECK_NE(this->Info().num_col_, 0);
-  this->InitializeSparsePage();
+  this->InitializeSparsePage(ctx);
   if (!column_source_) {
-    column_source_ = std::make_shared<CSCPageSource>(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_,
-        this->n_batches_, cache_info_.at(id), sparse_page_source_);
+    column_source_ =
+        std::make_shared<CSCPageSource>(this->missing_, ctx->Threads(), this->Info().num_col_,
+                                        this->n_batches_, cache_info_.at(id), sparse_page_source_);
   } else {
     column_source_->Reset();
   }
@@ -144,14 +148,14 @@ BatchSet<CSCPage> SparsePageDMatrix::GetColumnBatches() {
   return BatchSet<CSCPage>(BatchIterator<CSCPage>(begin_iter));
 }
 
-BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() {
+BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const *ctx) {
   auto id = MakeCache(this, ".sorted.col.page", cache_prefix_, &cache_info_);
   CHECK_NE(this->Info().num_col_, 0);
-  this->InitializeSparsePage();
+  this->InitializeSparsePage(ctx);
   if (!sorted_column_source_) {
     sorted_column_source_ = std::make_shared<SortedCSCPageSource>(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_,
-        this->n_batches_, cache_info_.at(id), sparse_page_source_);
+        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
+        sparse_page_source_);
   } else {
     sorted_column_source_->Reset();
   }
@@ -159,27 +163,27 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches() {
   return BatchSet<SortedCSCPage>(BatchIterator<SortedCSCPage>(begin_iter));
 }
 
-BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam &param) {
+BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
+                                                               const BatchParam &param) {
   CHECK_GE(param.max_bin, 2);
   auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
-  this->InitializeSparsePage();
-  if (!cache_info_.at(id)->written || RegenGHist(batch_param_, param)) {
+  this->InitializeSparsePage(ctx);
+  if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
     cache_info_.erase(id);
     MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
     LOG(INFO) << "Generating new Gradient Index.";
     // Use sorted sketch for approx.
     auto sorted_sketch = param.regen;
-    auto cuts =
-        common::SketchOnDMatrix(this, param.max_bin, ctx_.Threads(), sorted_sketch, param.hess);
-    this->InitializeSparsePage();  // reset after use.
+    auto cuts = common::SketchOnDMatrix(ctx, this, param.max_bin, sorted_sketch, param.hess);
+    this->InitializeSparsePage(ctx);  // reset after use.
 
     batch_param_ = param;
     ghist_index_source_.reset();
     CHECK_NE(cuts.Values().size(), 0);
     auto ft = this->info_.feature_types.ConstHostSpan();
     ghist_index_source_.reset(new GradientIndexPageSource(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_, this->n_batches_,
-        cache_info_.at(id), param, std::move(cuts), this->IsDense(), ft, sparse_page_source_));
+        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
+        param, std::move(cuts), this->IsDense(), ft, sparse_page_source_));
   } else {
     CHECK(ghist_index_source_);
     ghist_index_source_->Reset();
@@ -189,11 +193,10 @@ BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(const BatchParam
 }
 
 #if !defined(XGBOOST_USE_CUDA)
-BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam &) {
+BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const *, const BatchParam &) {
   common::AssertGPUSupport();
   auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
   return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu
index b36a0e2a3..0a4cde43d 100644
--- a/src/data/sparse_page_dmatrix.cu
+++ b/src/data/sparse_page_dmatrix.cu
@@ -1,42 +1,40 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023 by XGBoost contributors
  */
-#include "sparse_page_source.h"
 #include "../common/hist_util.cuh"
+#include "batch_utils.h"  // for CheckEmpty, RegenGHist
 #include "ellpack_page.cuh"
 #include "sparse_page_dmatrix.h"
+#include "sparse_page_source.h"
 
-namespace xgboost {
-namespace data {
-BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& param) {
-  CHECK_GE(param.gpu_id, 0);
+namespace xgboost::data {
+BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
+                                                           const BatchParam& param) {
+  CHECK(ctx->IsCUDA());
   CHECK_GE(param.max_bin, 2);
-  if (!(batch_param_ != BatchParam{})) {
-    CHECK(param != BatchParam{}) << "Batch parameter is not initialized.";
-  }
+  detail::CheckEmpty(batch_param_, param);
   auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
   size_t row_stride = 0;
-  this->InitializeSparsePage();
-  if (!cache_info_.at(id)->written || RegenGHist(batch_param_, param)) {
+  this->InitializeSparsePage(ctx);
+  if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
     // reinitialize the cache
     cache_info_.erase(id);
     MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
     std::unique_ptr<common::HistogramCuts> cuts;
-    cuts.reset(new common::HistogramCuts{
-        common::DeviceSketch(param.gpu_id, this, param.max_bin, 0)});
-    this->InitializeSparsePage();  // reset after use.
+    cuts.reset(
+        new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
+    this->InitializeSparsePage(ctx);  // reset after use.
 
     row_stride = GetRowStride(this);
-    this->InitializeSparsePage();  // reset after use.
+    this->InitializeSparsePage(ctx);  // reset after use.
     CHECK_NE(row_stride, 0);
     batch_param_ = param;
 
     auto ft = this->info_.feature_types.ConstDeviceSpan();
     ellpack_page_source_.reset();  // release resources.
     ellpack_page_source_.reset(new EllpackPageSource(
-        this->missing_, this->ctx_.Threads(), this->Info().num_col_,
-        this->n_batches_, cache_info_.at(id), param, std::move(cuts),
-        this->IsDense(), row_stride, ft, sparse_page_source_));
+        this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
+        param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id));
   } else {
     CHECK(sparse_page_source_);
     ellpack_page_source_->Reset();
@@ -45,5 +43,4 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(const BatchParam& par
   auto begin_iter = BatchIterator<EllpackPage>(ellpack_page_source_);
   return BatchSet<EllpackPage>(BatchIterator<EllpackPage>(begin_iter));
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h
index aa0be6984..02aa9a5c0 100644
--- a/src/data/sparse_page_dmatrix.h
+++ b/src/data/sparse_page_dmatrix.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2015-2021 by Contributors
+/**
+ * Copyright 2015-2023, XGBoost Contributors
  * \file sparse_page_dmatrix.h
  * \brief External-memory version of DMatrix.
  * \author Tianqi Chen
@@ -9,12 +9,13 @@
 
 #include <xgboost/data.h>
 #include <xgboost/logging.h>
+
 #include <algorithm>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
-#include <map>
 
 #include "ellpack_page_source.h"
 #include "gradient_index_page_source.h"
@@ -69,19 +70,18 @@ class SparsePageDMatrix : public DMatrix {
   XGDMatrixCallbackNext *next_;
 
   float missing_;
-  Context ctx_;
+  Context fmat_ctx_;
   std::string cache_prefix_;
-  uint32_t n_batches_ {0};
+  uint32_t n_batches_{0};
   // sparse page is the source to other page types, we make a special member function.
-  void InitializeSparsePage();
+  void InitializeSparsePage(Context const *ctx);
   // Non-virtual version that can be used in constructor
-  BatchSet<SparsePage> GetRowBatchesImpl();
+  BatchSet<SparsePage> GetRowBatchesImpl(Context const *ctx);
 
  public:
-  explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy,
-                             DataIterResetCallback *reset,
-                             XGDMatrixCallbackNext *next, float missing,
-                             int32_t nthreads, std::string cache_prefix);
+  explicit SparsePageDMatrix(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
+                             XGDMatrixCallbackNext *next, float missing, int32_t nthreads,
+                             std::string cache_prefix);
 
   ~SparsePageDMatrix() override {
     // Clear out all resources before deleting the cache file.
@@ -98,9 +98,9 @@ class SparsePageDMatrix : public DMatrix {
     }
   }
 
-  MetaInfo& Info() override;
-  const MetaInfo& Info() const override;
-  Context const* Ctx() const override { return &ctx_; }
+  MetaInfo &Info() override;
+  const MetaInfo &Info() const override;
+  Context const *Ctx() const override { return &fmat_ctx_; }
 
   bool SingleColBlock() const override { return false; }
   DMatrix *Slice(common::Span<int32_t const>) override {
@@ -114,11 +114,11 @@ class SparsePageDMatrix : public DMatrix {
 
  private:
   BatchSet<SparsePage> GetRowBatches() override;
-  BatchSet<CSCPage> GetColumnBatches() override;
-  BatchSet<SortedCSCPage> GetSortedColumnBatches() override;
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam&) override;
-  BatchSet<ExtSparsePage> GetExtBatches(BatchParam const &) override {
+  BatchSet<CSCPage> GetColumnBatches(Context const *ctx) override;
+  BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const *ctx) override;
+  BatchSet<EllpackPage> GetEllpackBatches(Context const *ctx, const BatchParam &param) override;
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const *ctx, const BatchParam &) override;
+  BatchSet<ExtSparsePage> GetExtBatches(Context const *, BatchParam const &) override {
     LOG(FATAL) << "Can not obtain a single CSR page for external memory DMatrix";
     return BatchSet<ExtSparsePage>(BatchIterator<ExtSparsePage>(nullptr));
   }
@@ -141,9 +141,8 @@ inline std::string MakeId(std::string prefix, SparsePageDMatrix *ptr) {
   return prefix + "-" + ss.str();
 }
 
-inline std::string
-MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
-          std::map<std::string, std::shared_ptr<Cache>> *out) {
+inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::string prefix,
+                             std::map<std::string, std::shared_ptr<Cache>> *out) {
   auto &cache_info = *out;
   auto name = MakeId(prefix, ptr);
   auto id = name + format;
diff --git a/src/linear/coordinate_common.h b/src/linear/coordinate_common.h
index f61c423f0..f08856bd1 100644
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  * \author Rory Mitchell
  */
 #pragma once
@@ -78,11 +78,12 @@ inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
  *
  * \return  The gradient and diagonal Hessian entry for a given feature.
  */
-inline std::pair<double, double> GetGradient(int group_idx, int num_group, int fidx,
-                                             const std::vector<GradientPair> &gpair,
+inline std::pair<double, double> GetGradient(Context const *ctx, int group_idx, int num_group,
+                                             bst_feature_t fidx,
+                                             std::vector<GradientPair> const &gpair,
                                              DMatrix *p_fmat) {
   double sum_grad = 0.0, sum_hess = 0.0;
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+  for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
     auto page = batch.GetView();
     auto col = page[fidx];
     const auto ndata = static_cast<bst_omp_uint>(col.size());
@@ -115,7 +116,7 @@ inline std::pair<double, double> GetGradientParallel(Context const *ctx, int gro
   std::vector<double> sum_grad_tloc(ctx->Threads(), 0.0);
   std::vector<double> sum_hess_tloc(ctx->Threads(), 0.0);
 
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+  for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
     auto page = batch.GetView();
     auto col = page[fidx];
     const auto ndata = static_cast<bst_omp_uint>(col.size());
@@ -177,16 +178,16 @@ inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_
  * \param in_gpair  The gradient vector to be updated.
  * \param p_fmat    The input feature matrix.
  */
-inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
-                                   float dw, std::vector<GradientPair> *in_gpair,
-                                   DMatrix *p_fmat, int32_t n_threads) {
+inline void UpdateResidualParallel(Context const *ctx, bst_feature_t fidx, int group_idx,
+                                   int num_group, float dw, std::vector<GradientPair> *in_gpair,
+                                   DMatrix *p_fmat) {
   if (dw == 0.0f) return;
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+  for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
     auto page = batch.GetView();
     auto col = page[fidx];
     // update grad value
     const auto num_row = static_cast<bst_omp_uint>(col.size());
-    common::ParallelFor(num_row, n_threads, [&](auto j) {
+    common::ParallelFor(num_row, ctx->Threads(), [&](auto j) {
       GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
       if (p.GetHess() < 0.0f) return;
       p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
@@ -203,12 +204,12 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
  * \param in_gpair  The gradient vector to be updated.
  * \param p_fmat    The input feature matrix.
  */
-inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias,
-                                       std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
-                                       int32_t n_threads) {
+inline void UpdateBiasResidualParallel(Context const *ctx, int group_idx, int num_group,
+                                       float dbias, std::vector<GradientPair> *in_gpair,
+                                       DMatrix *p_fmat) {
   if (dbias == 0.0f) return;
   const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
-  common::ParallelFor(ndata, n_threads, [&](auto i) {
+  common::ParallelFor(ndata, ctx->Threads(), [&](auto i) {
     GradientPair &g = (*in_gpair)[i * num_group + group_idx];
     if (g.GetHess() < 0.0f) return;
     g += GradientPair(g.GetHess() * dbias, 0);
@@ -220,18 +221,16 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias
  *        in coordinate descent algorithms.
  */
 class FeatureSelector {
- protected:
-  int32_t n_threads_{-1};
-
  public:
-  explicit FeatureSelector(int32_t n_threads) : n_threads_{n_threads} {}
+  FeatureSelector() = default;
   /*! \brief factory method */
-  static FeatureSelector *Create(int choice, int32_t n_threads);
+  static FeatureSelector *Create(int choice);
   /*! \brief virtual destructor */
   virtual ~FeatureSelector() = default;
   /**
    * \brief Setting up the selector state prior to looping through features.
    *
+   * \param ctx    The booster context.
    * \param model  The model.
    * \param gpair  The gpair.
    * \param p_fmat The feature matrix.
@@ -239,13 +238,12 @@ class FeatureSelector {
    * \param lambda Regularisation lambda.
    * \param param  A parameter with algorithm-dependent use.
    */
-  virtual void Setup(const gbm::GBLinearModel &,
-                     const std::vector<GradientPair> &,
-                     DMatrix *,
-                     float , float , int ) {}
+  virtual void Setup(Context const *, const gbm::GBLinearModel &,
+                     const std::vector<GradientPair> &, DMatrix *, float, float, int) {}
   /**
    * \brief Select next coordinate to update.
    *
+   * \param ctx       Booster context
    * \param iteration The iteration in a loop through features
    * \param model     The model.
    * \param group_idx Zero-based index of the group.
@@ -256,11 +254,9 @@ class FeatureSelector {
    *
    * \return  The index of the selected feature. -1 indicates none selected.
    */
-  virtual int NextFeature(int iteration,
-                          const gbm::GBLinearModel &model,
-                          int group_idx,
-                          const std::vector<GradientPair> &gpair,
-                          DMatrix *p_fmat, float alpha, float lambda) = 0;
+  virtual int NextFeature(Context const *ctx, int iteration, const gbm::GBLinearModel &model,
+                          int group_idx, const std::vector<GradientPair> &gpair, DMatrix *p_fmat,
+                          float alpha, float lambda) = 0;
 };
 
 /**
@@ -269,9 +265,8 @@ class FeatureSelector {
 class CyclicFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  int NextFeature(int iteration, const gbm::GBLinearModel &model,
-                  int , const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     return iteration % model.learner_model_param->num_feature;
   }
 };
@@ -283,8 +278,7 @@ class CyclicFeatureSelector : public FeatureSelector {
 class ShuffleFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  void Setup(const gbm::GBLinearModel &model,
-             const std::vector<GradientPair>&,
+  void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
              DMatrix *, float, float, int) override {
     if (feat_index_.size() == 0) {
       feat_index_.resize(model.learner_model_param->num_feature);
@@ -293,9 +287,8 @@ class ShuffleFeatureSelector : public FeatureSelector {
     std::shuffle(feat_index_.begin(), feat_index_.end(), common::GlobalRandom());
   }
 
-  int NextFeature(int iteration, const gbm::GBLinearModel &model,
-                  int, const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int iteration, const gbm::GBLinearModel &model, int,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     return feat_index_[iteration % model.learner_model_param->num_feature];
   }
 
@@ -310,9 +303,8 @@ class ShuffleFeatureSelector : public FeatureSelector {
 class RandomFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  int NextFeature(int, const gbm::GBLinearModel &model,
-                  int, const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     return common::GlobalRandom()() % model.learner_model_param->num_feature;
   }
 };
@@ -329,8 +321,7 @@ class RandomFeatureSelector : public FeatureSelector {
 class GreedyFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  void Setup(const gbm::GBLinearModel &model,
-             const std::vector<GradientPair> &,
+  void Setup(Context const *, const gbm::GBLinearModel &model, const std::vector<GradientPair> &,
              DMatrix *, float, float, int param) override {
     top_k_ = static_cast<bst_uint>(param);
     const bst_uint ngroup = model.learner_model_param->num_output_group;
@@ -344,7 +335,7 @@ class GreedyFeatureSelector : public FeatureSelector {
     }
   }
 
-  int NextFeature(int, const gbm::GBLinearModel &model,
+  int NextFeature(Context const* ctx, int, const gbm::GBLinearModel &model,
                   int group_idx, const std::vector<GradientPair> &gpair,
                   DMatrix *p_fmat, float alpha, float lambda) override {
     // k-th selected feature for a group
@@ -356,9 +347,9 @@ class GreedyFeatureSelector : public FeatureSelector {
     const bst_omp_uint nfeat = model.learner_model_param->num_feature;
     // Calculate univariate gradient sums
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
       auto page = batch.GetView();
-      common::ParallelFor(nfeat, this->n_threads_, [&](bst_omp_uint i) {
+      common::ParallelFor(nfeat, ctx->Threads(), [&](bst_omp_uint i) {
         const auto col = page[i];
         const bst_uint ndata = col.size();
         auto &sums = gpair_sums_[group_idx * nfeat + i];
@@ -406,9 +397,10 @@ class GreedyFeatureSelector : public FeatureSelector {
 class ThriftyFeatureSelector : public FeatureSelector {
  public:
   using FeatureSelector::FeatureSelector;
-  void Setup(const gbm::GBLinearModel &model,
-             const std::vector<GradientPair> &gpair,
-             DMatrix *p_fmat, float alpha, float lambda, int param) override {
+
+  void Setup(Context const *ctx, const gbm::GBLinearModel &model,
+             const std::vector<GradientPair> &gpair, DMatrix *p_fmat, float alpha, float lambda,
+             int param) override {
     top_k_ = static_cast<bst_uint>(param);
     if (param <= 0) top_k_ = std::numeric_limits<bst_uint>::max();
     const bst_uint ngroup = model.learner_model_param->num_output_group;
@@ -422,10 +414,10 @@ class ThriftyFeatureSelector : public FeatureSelector {
     }
     // Calculate univariate gradient sums
     std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx)) {
       auto page = batch.GetView();
       // column-parallel is usually fastaer than row-parallel
-      common::ParallelFor(nfeat, this->n_threads_, [&](auto i) {
+      common::ParallelFor(nfeat, ctx->Threads(), [&](auto i) {
         const auto col = page[i];
         const bst_uint ndata = col.size();
         for (bst_uint gid = 0u; gid < ngroup; ++gid) {
@@ -462,9 +454,8 @@ class ThriftyFeatureSelector : public FeatureSelector {
     }
   }
 
-  int NextFeature(int, const gbm::GBLinearModel &model,
-                  int group_idx, const std::vector<GradientPair> &,
-                  DMatrix *, float, float) override {
+  int NextFeature(Context const *, int, const gbm::GBLinearModel &model, int group_idx,
+                  const std::vector<GradientPair> &, DMatrix *, float, float) override {
     // k-th selected feature for a group
     auto k = counter_[group_idx]++;
     // stop after either reaching top-N or going through all the features in a group
@@ -482,18 +473,18 @@ class ThriftyFeatureSelector : public FeatureSelector {
   std::vector<std::pair<double, double>> gpair_sums_;
 };
 
-inline FeatureSelector *FeatureSelector::Create(int choice, int32_t n_threads) {
+inline FeatureSelector *FeatureSelector::Create(int choice) {
   switch (choice) {
     case kCyclic:
-      return new CyclicFeatureSelector(n_threads);
+      return new CyclicFeatureSelector;
     case kShuffle:
-      return new ShuffleFeatureSelector(n_threads);
+      return new ShuffleFeatureSelector;
     case kThrifty:
-      return new ThriftyFeatureSelector(n_threads);
+      return new ThriftyFeatureSelector;
     case kGreedy:
-      return new GreedyFeatureSelector(n_threads);
+      return new GreedyFeatureSelector;
     case kRandom:
-      return new RandomFeatureSelector(n_threads);
+      return new RandomFeatureSelector;
     default:
       LOG(FATAL) << "unknown coordinate selector: " << choice;
   }
diff --git a/src/linear/updater_coordinate.cc b/src/linear/updater_coordinate.cc
index 29ba5451b..84f15d706 100644
--- a/src/linear/updater_coordinate.cc
+++ b/src/linear/updater_coordinate.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  * \author Rory Mitchell
  */
 
@@ -30,7 +30,7 @@ class CoordinateUpdater : public LinearUpdater {
       tparam_.UpdateAllowUnknown(args)
     };
     cparam_.UpdateAllowUnknown(rest);
-    selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
     monitor_.Init("CoordinateUpdater");
   }
 
@@ -56,19 +56,17 @@ class CoordinateUpdater : public LinearUpdater {
       auto dbias = static_cast<float>(tparam_.learning_rate *
                                       CoordinateDeltaBias(grad.first, grad.second));
       model->Bias()[group_idx] += dbias;
-      UpdateBiasResidualParallel(group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
-                                 ctx_->Threads());
+      UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
     }
     // prepare for updating the weights
-    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                    tparam_.reg_alpha_denorm,
-                    tparam_.reg_lambda_denorm, cparam_.top_k);
+    selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
+                     tparam_.reg_lambda_denorm, cparam_.top_k);
     // update weights
     for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
       for (unsigned i = 0U; i < model->learner_model_param->num_feature; i++) {
-        int fidx = selector_->NextFeature
-          (i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
-           tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
+        int fidx =
+            selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
+                                   tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
       }
@@ -76,8 +74,8 @@ class CoordinateUpdater : public LinearUpdater {
     monitor_.Stop("UpdateFeature");
   }
 
-  inline void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair,
-                            DMatrix *p_fmat, gbm::GBLinearModel *model) {
+  void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair, DMatrix *p_fmat,
+                     gbm::GBLinearModel *model) {
     const int ngroup = model->learner_model_param->num_output_group;
     bst_float &w = (*model)[fidx][group_idx];
     auto gradient = GetGradientParallel(ctx_, group_idx, ngroup, fidx,
@@ -87,8 +85,7 @@ class CoordinateUpdater : public LinearUpdater {
         CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
                         tparam_.reg_lambda_denorm));
     w += dw;
-    UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat,
-                           ctx_->Threads());
+    UpdateResidualParallel(ctx_, fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
   }
 
  private:
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index b63c1317e..7d658cf78 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -32,7 +32,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
   void Configure(Args const& args) override {
     tparam_.UpdateAllowUnknown(args);
     coord_param_.UpdateAllowUnknown(args);
-    selector_.reset(FeatureSelector::Create(tparam_.feature_selector, ctx_->Threads()));
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
     monitor_.Init("GPUCoordinateUpdater");
   }
 
@@ -53,7 +53,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
 
     CHECK(p_fmat->SingleColBlock());
-    SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
+    SparsePage const &batch = *(p_fmat->GetBatches<CSCPage>(ctx_).begin());
     auto page = batch.GetView();
 
     if (IsEmpty()) {
@@ -112,16 +112,15 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
     this->UpdateBias(model);
     monitor_.Stop("UpdateBias");
     // prepare for updating the weights
-    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                     tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
-                     coord_param_.top_k);
+    selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm,
+                     tparam_.reg_lambda_denorm, coord_param_.top_k);
     monitor_.Start("UpdateFeature");
     for (uint32_t group_idx = 0; group_idx < model->learner_model_param->num_output_group;
          ++group_idx) {
       for (auto i = 0U; i < model->learner_model_param->num_feature; i++) {
-        auto fidx = selector_->NextFeature(
-            i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
-            tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
+        auto fidx =
+            selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
+                                   tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, model);
       }
diff --git a/src/linear/updater_shotgun.cc b/src/linear/updater_shotgun.cc
index d8592f1cf..18b747f64 100644
--- a/src/linear/updater_shotgun.cc
+++ b/src/linear/updater_shotgun.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2018 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  * \author Tianqi Chen, Rory Mitchell
  */
 
@@ -21,7 +21,7 @@ class ShotgunUpdater : public LinearUpdater {
       LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
                  << "Supported options are: {cyclic, shuffle}";
     }
-    selector_.reset(FeatureSelector::Create(param_.feature_selector, ctx_->Threads()));
+    selector_.reset(FeatureSelector::Create(param_.feature_selector));
   }
   void LoadConfig(Json const& in) override {
     auto const& config = get<Object const>(in);
@@ -45,18 +45,17 @@ class ShotgunUpdater : public LinearUpdater {
       auto dbias = static_cast<bst_float>(param_.learning_rate *
                                CoordinateDeltaBias(grad.first, grad.second));
       model->Bias()[gid] += dbias;
-      UpdateBiasResidualParallel(gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat,
-                                 ctx_->Threads());
+      UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat);
     }
 
     // lock-free parallel updates of weights
-    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                     param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
-    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+    selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
+                     param_.reg_lambda_denorm, 0);
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>(ctx_)) {
       auto page = batch.GetView();
       const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
       common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) {
-        int ii = selector_->NextFeature(i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
+        int ii = selector_->NextFeature(ctx_, i, *model, 0, in_gpair->ConstHostVector(), p_fmat,
                                         param_.reg_alpha_denorm, param_.reg_lambda_denorm);
         if (ii < 0) return;
         const bst_uint fid = ii;
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index b3b4c5e80..aa8972989 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -634,7 +634,7 @@ class CPUPredictor : public Predictor {
     if (!p_fmat->PageExists<SparsePage>()) {
       std::vector<Entry> workspace(p_fmat->Info().num_col_ * kUnroll * n_threads);
       auto ft = p_fmat->Info().feature_types.ConstHostVector();
-      for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>({})) {
+      for (auto const &batch : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, {})) {
         if (blocked) {
           PredictBatchByBlockOfRowsKernel<GHistIndexMatrixView, kBlockOfRowsSize>(
               GHistIndexMatrixView{batch, p_fmat->Info().num_col_, ft, workspace, n_threads}, model,
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 2439a277f..11662f9b8 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -706,7 +706,7 @@ class GPUPredictor : public xgboost::Predictor {
       }
     } else {
       size_t batch_offset = 0;
-      for (auto const& page : dmat->GetBatches<EllpackPage>(BatchParam{})) {
+      for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
         dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
         auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
         this->PredictInternal(
@@ -983,7 +983,7 @@ class GPUPredictor : public xgboost::Predictor {
         batch_offset += batch.Size();
       }
     } else {
-      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(BatchParam{})) {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
         bst_row_t batch_offset = 0;
         EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
         size_t num_rows = batch.Size();
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu
index 676497336..f22fa172f 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019-2021 by XGBoost Contributors
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
  */
 #include <thrust/functional.h>
 #include <thrust/random.h>
@@ -12,6 +12,7 @@
 #include <utility>
 
 #include "../../common/compressed_iterator.h"
+#include "../../common/cuda_context.cuh"  // for CUDAContext
 #include "../../common/random.h"
 #include "../param.h"
 #include "gradient_based_sampler.cuh"
@@ -147,25 +148,26 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
 
 NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
 
-GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
+GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
+                                       DMatrix* dmat) {
   return {dmat->Info().num_row_, page_, gpair};
 }
 
-ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
-                                                   size_t n_rows,
-                                                   const BatchParam& batch_param)
-    : batch_param_(batch_param),
-      page_(new EllpackPageImpl(batch_param.gpu_id, page->Cuts(), page->is_dense,
-                                page->row_stride, n_rows)) {}
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
+                                                   size_t n_rows, BatchParam batch_param)
+    : batch_param_{std::move(batch_param)},
+      page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
+                                n_rows)) {}
 
-GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair> gpair,
+GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
+                                                     common::Span<GradientPair> gpair,
                                                      DMatrix* dmat) {
   if (!page_concatenated_) {
     // Concatenate all the external memory ELLPACK pages into a single in-memory page.
     size_t offset = 0;
-    for (auto& batch : dmat->GetBatches<EllpackPage>(batch_param_)) {
+    for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
       auto page = batch.Impl();
-      size_t num_elements = page_->Copy(batch_param_.gpu_id, page, offset);
+      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
       offset += num_elements;
     }
     page_concatenated_ = true;
@@ -176,12 +178,13 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
 UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
     : page_(page), subsample_(subsample) {}
 
-GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
+GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                                            DMatrix* dmat) {
   // Set gradient pair to 0 with p = 1 - subsample
-  thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
-                     thrust::counting_iterator<size_t>(0),
-                     BernoulliTrial(common::GlobalRandom()(), subsample_),
-                     GradientPair());
+  auto cuctx = ctx->CUDACtx();
+  thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                     thrust::counting_iterator<std::size_t>(0),
+                     BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
   return {dmat->Info().num_row_, page_, gpair};
 }
 
@@ -192,7 +195,8 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
       subsample_(subsample),
       sample_row_index_(n_rows) {}
 
-GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientPair> gpair,
+GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
+                                                          common::Span<GradientPair> gpair,
                                                           DMatrix* dmat) {
   // Set gradient pair to 0 with p = 1 - subsample
   thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair),
@@ -216,18 +220,17 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
                     sample_row_index_.begin(),
                     ClearEmptyRows());
 
-  auto batch_iterator = dmat->GetBatches<EllpackPage>(batch_param_);
+  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(
-      batch_param_.gpu_id, first_page->Cuts(), first_page->is_dense,
-                           first_page->row_stride, sample_rows));
+  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
+                                  first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
   thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
   for (auto& batch : batch_iterator) {
-    page_->Compact(batch_param_.gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
   }
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@@ -242,18 +245,17 @@ GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
       threshold_(n_rows + 1, 0.0f),
       grad_sum_(n_rows, 0.0f) {}
 
-GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpair,
-                                                  DMatrix* dmat) {
+GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
+                                                  common::Span<GradientPair> gpair, DMatrix* dmat) {
+  auto cuctx = ctx->CUDACtx();
   size_t n_rows = dmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
       gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
 
   // Perform Poisson sampling in place.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    thrust::counting_iterator<size_t>(0),
-                    dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_),
-                                    threshold_index,
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
+                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                     RandomWeight(common::GlobalRandom()())));
   return {n_rows, page_, gpair};
 }
@@ -268,7 +270,8 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
       grad_sum_(n_rows, 0.0f),
       sample_row_index_(n_rows) {}
 
-GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<GradientPair> gpair,
+GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
+                                                                common::Span<GradientPair> gpair,
                                                                 DMatrix* dmat) {
   size_t n_rows = dmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
@@ -298,28 +301,25 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
                     sample_row_index_.begin(),
                     ClearEmptyRows());
 
-  auto batch_iterator = dmat->GetBatches<EllpackPage>(batch_param_);
+  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
   auto first_page = (*batch_iterator.begin()).Impl();
   // Create a new ELLPACK page with empty rows.
   page_.reset();  // Release the device memory first before reallocating
-  page_.reset(new EllpackPageImpl(batch_param_.gpu_id, first_page->Cuts(),
-                                  first_page->is_dense,
+  page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
                                   first_page->row_stride, sample_rows));
 
   // Compact the ELLPACK pages into the single sample page.
   thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
   for (auto& batch : batch_iterator) {
-    page_->Compact(batch_param_.gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
+    page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
   }
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }
 
-GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
-                                           size_t n_rows,
-                                           const BatchParam& batch_param,
-                                           float subsample,
-                                           int sampling_method) {
+GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
+                                           size_t n_rows, const BatchParam& batch_param,
+                                           float subsample, int sampling_method) {
   monitor_.Init("gradient_based_sampler");
 
   bool is_sampling = subsample < 1.0;
@@ -346,7 +346,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
     }
   } else {
     if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(page, n_rows, batch_param));
+      strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
     } else {
       strategy_.reset(new NoSampling(page));
     }
@@ -354,10 +354,10 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
 }
 
 // Sample a DMatrix based on the given gradient pairs.
-GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
-                                                 DMatrix* dmat) {
+GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
+                                                 common::Span<GradientPair> gpair, DMatrix* dmat) {
   monitor_.Start("Sample");
-  GradientBasedSample sample = strategy_->Sample(gpair, dmat);
+  GradientBasedSample sample = strategy_->Sample(ctx, gpair, dmat);
   monitor_.Stop("Sample");
   return sample;
 }
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh
index 5be6c71de..dafb98cfd 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -24,7 +24,8 @@ struct GradientBasedSample {
 class SamplingStrategy {
  public:
   /*! \brief Sample from a DMatrix based on the given gradient pairs. */
-  virtual GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) = 0;
+  virtual GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                                     DMatrix* dmat) = 0;
   virtual ~SamplingStrategy() = default;
 };
 
@@ -32,7 +33,8 @@ class SamplingStrategy {
 class NoSampling : public SamplingStrategy {
  public:
   explicit NoSampling(EllpackPageImpl const* page);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   EllpackPageImpl const* page_;
@@ -41,10 +43,10 @@ class NoSampling : public SamplingStrategy {
 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryNoSampling : public SamplingStrategy {
  public:
-  ExternalMemoryNoSampling(EllpackPageImpl const* page,
-                           size_t n_rows,
-                           const BatchParam& batch_param);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
+                           BatchParam batch_param);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
@@ -56,7 +58,8 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
 class UniformSampling : public SamplingStrategy {
  public:
   UniformSampling(EllpackPageImpl const* page, float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   EllpackPageImpl const* page_;
@@ -66,10 +69,9 @@ class UniformSampling : public SamplingStrategy {
 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryUniformSampling : public SamplingStrategy {
  public:
-  ExternalMemoryUniformSampling(size_t n_rows,
-                                BatchParam batch_param,
-                                float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
@@ -82,11 +84,10 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
  public:
-  GradientBasedSampling(EllpackPageImpl const* page,
-                        size_t n_rows,
-                        const BatchParam& batch_param,
+  GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
                         float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   EllpackPageImpl const* page_;
@@ -98,10 +99,9 @@ class GradientBasedSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in external memory mode.. */
 class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  public:
-  ExternalMemoryGradientBasedSampling(size_t n_rows,
-                                      BatchParam batch_param,
-                                      float subsample);
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
+  ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
@@ -124,14 +124,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  */
 class GradientBasedSampler {
  public:
-  GradientBasedSampler(EllpackPageImpl const* page,
-                       size_t n_rows,
-                       const BatchParam& batch_param,
-                       float subsample,
-                       int sampling_method);
+  GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
+                       const BatchParam& batch_param, float subsample, int sampling_method);
 
   /*! \brief Sample from a DMatrix based on the given gradient pairs. */
-  GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
 
   /*! \brief Calculate the threshold used to normalize sampling probabilities. */
   static size_t CalculateThresholdIndex(common::Span<GradientPair> gpair,
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 148614a7e..f637427ad 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -66,7 +66,7 @@ class GloablApproxBuilder {
     partitioner_.clear();
     // Generating the GHistIndexMatrix is quite slow, is there a way to speed it up?
     for (auto const &page :
-         p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess, *task_))) {
+         p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess, *task_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
         feature_values_ = page.cut;
@@ -97,7 +97,7 @@ class GloablApproxBuilder {
     std::vector<CPUExpandEntry> nodes{best};
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
       histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
                                    {}, gpair);
       i++;
@@ -148,7 +148,7 @@ class GloablApproxBuilder {
 
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
       histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
                                    nodes_to_build, nodes_to_sub, gpair);
       i++;
@@ -214,7 +214,8 @@ class GloablApproxBuilder {
 
       monitor_->Start("UpdatePosition");
       size_t page_id = 0;
-      for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
+      for (auto const &page :
+           p_fmat->GetBatches<GHistIndexMatrix>(ctx_, BatchSpec(*param_, hess))) {
         partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
         page_id++;
       }
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 02edfa74a..bda9b4dfa 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -76,7 +76,7 @@ class ColMaker: public TreeUpdater {
     // Finds densities if we don't already have them
     if (column_densities_.empty()) {
       std::vector<size_t> column_size(dmat->Info().num_col_);
-      for (const auto &batch : dmat->GetBatches<SortedCSCPage>()) {
+      for (const auto &batch : dmat->GetBatches<SortedCSCPage>(ctx_)) {
         auto page = batch.GetView();
         for (auto i = 0u; i < batch.Size(); i++) {
           column_size[i] += page[i].size();
@@ -467,7 +467,7 @@ class ColMaker: public TreeUpdater {
       auto evaluator = tree_evaluator_.GetEvaluator();
 
       auto feat_set = column_sampler_.GetFeatureSet(depth);
-      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
         this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
@@ -546,7 +546,7 @@ class ColMaker: public TreeUpdater {
       }
       std::sort(fsplits.begin(), fsplits.end());
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
         auto page = batch.GetView();
         for (auto fid : fsplits) {
           auto col = page[fid];
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 2d1b7a24d..c14e19db1 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -218,7 +218,7 @@ struct GPUHistMakerDevice {
         column_sampler(column_sampler_seed),
         interaction_constraints(param, n_features),
         batch_param(std::move(_batch_param)) {
-    sampler.reset(new GradientBasedSampler(page, _n_rows, batch_param, param.subsample,
+    sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
                                            param.sampling_method));
     if (!param.monotone_constraints.empty()) {
       // Copy assigning an empty vector causes an exception in MSVC debug builds
@@ -258,7 +258,7 @@ struct GPUHistMakerDevice {
     dh::safe_cuda(cudaMemcpyAsync(
         d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
         dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
-    auto sample = sampler->Sample(dh::ToSpan(d_gpair), dmat);
+    auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
     page = sample.page;
     gpair = sample.gpair;
 
@@ -808,11 +808,8 @@ class GPUHistMaker : public TreeUpdater {
     uint32_t column_sampling_seed = common::GlobalRandom()();
     collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
 
-    BatchParam batch_param{
-        ctx_->gpu_id,
-        param->max_bin,
-    };
-    auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
+    auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
+    auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     info_->feature_types.SetDevice(ctx_->gpu_id);
     maker.reset(new GPUHistMakerDevice<GradientSumT>(
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 749e799a8..f0dd3dd12 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -134,7 +134,7 @@ class MultiTargetHistBuilder {
                       std::vector<MultiExpandEntry> const &applied) {
     monitor_->Start(__func__);
     std::size_t page_id{0};
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(this->param_))) {
       this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
       page_id++;
     }
@@ -152,7 +152,7 @@ class MultiTargetHistBuilder {
     std::size_t page_id = 0;
     bst_bin_t n_total_bins = 0;
     partitioner_.clear();
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
       } else {
@@ -206,7 +206,7 @@ class MultiTargetHistBuilder {
     std::vector<MultiExpandEntry> nodes{best};
     std::size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       for (bst_target_t t{0}; t < n_targets; ++t) {
         auto t_gpair = gpair.Slice(linalg::All(), t);
         histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
@@ -225,7 +225,7 @@ class MultiTargetHistBuilder {
     for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
       hists.push_back(&histogram_builder_[t].Histogram());
     }
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes);
       break;
     }
@@ -263,7 +263,7 @@ class MultiTargetHistBuilder {
 
     std::size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) {
         auto t_gpair = gpair.Slice(linalg::All(), t);
         // Make sure the gradient matrix is f-order.
@@ -283,7 +283,7 @@ class MultiTargetHistBuilder {
     for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) {
       hists.push_back(&histogram_builder_[t].Histogram());
     }
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits);
       break;
     }
@@ -383,7 +383,7 @@ class HistBuilder {
     std::size_t page_id{0};
     bst_bin_t n_total_bins{0};
     partitioner_.clear();
-    for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &page : fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
       } else {
@@ -406,7 +406,7 @@ class HistBuilder {
     monitor_->Start(__func__);
     auto const &histograms = histogram_builder_->Histogram();
     auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits);
       break;
     }
@@ -423,7 +423,7 @@ class HistBuilder {
 
     std::size_t page_id = 0;
     auto space = ConstructHistSpace(partitioner_, {node});
-    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       std::vector<CPUExpandEntry> nodes_to_build{node};
       std::vector<CPUExpandEntry> nodes_to_sub;
       this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
@@ -439,7 +439,7 @@ class HistBuilder {
          * Specialized code for dense data: For dense data (with no missing value), the sum
          * of gradient histogram is equal to snode[nid]
          */
-        auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_)).begin());
+        auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_)).begin());
         std::vector<std::uint32_t> const &row_ptr = gmat.cut.Ptrs();
         CHECK_GE(row_ptr.size(), 2);
         std::uint32_t const ibegin = row_ptr[0];
@@ -467,7 +467,7 @@ class HistBuilder {
       std::vector<CPUExpandEntry> entries{node};
       monitor_->Start("EvaluateSplits");
       auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+      for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
         evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree,
                                    &entries);
         break;
@@ -503,7 +503,7 @@ class HistBuilder {
 
     std::size_t page_id{0};
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(param_))) {
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       histogram_builder_->BuildHist(page_id, space, gidx, p_tree,
                                     partitioner_.at(page_id).Partitions(), nodes_to_build,
                                     nodes_to_sub, gpair.Values());
@@ -515,7 +515,7 @@ class HistBuilder {
                       std::vector<CPUExpandEntry> const &applied) {
     monitor_->Start(__func__);
     std::size_t page_id{0};
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(HistBatch(this->param_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, HistBatch(param_))) {
       this->partitioner_.at(page_id).UpdatePosition(this->ctx_, page, applied, p_tree);
       page_id++;
     }
diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc
index de7b9a258..b49350702 100644
--- a/tests/cpp/common/test_column_matrix.cc
+++ b/tests/cpp/common/test_column_matrix.cc
@@ -14,11 +14,12 @@ TEST(DenseColumn, Test) {
   int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   BinTypeSize last{kUint8BinsTypeSize};
   for (int32_t max_num_bin : max_num_bins) {
     auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatrix();
     auto sparse_thresh = 0.2;
-    GHistIndexMatrix gmat{dmat.get(), max_num_bin, sparse_thresh, false, AllThreadsForTest()};
+    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false};
     ColumnMatrix column_matrix;
     for (auto const& page : dmat->GetBatches<SparsePage>()) {
       column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest());
@@ -62,9 +63,10 @@ TEST(SparseColumn, Test) {
   int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (int32_t max_num_bin : max_num_bins) {
     auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatrix();
-    GHistIndexMatrix gmat{dmat.get(), max_num_bin, 0.5f, false, AllThreadsForTest()};
+    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false};
     ColumnMatrix column_matrix;
     for (auto const& page : dmat->GetBatches<SparsePage>()) {
       column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest());
@@ -90,9 +92,10 @@ TEST(DenseColumnWithMissing, Test) {
   int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (int32_t max_num_bin : max_num_bins) {
     auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatrix();
-    GHistIndexMatrix gmat(dmat.get(), max_num_bin, 0.2, false, AllThreadsForTest());
+    GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false);
     ColumnMatrix column_matrix;
     for (auto const& page : dmat->GetBatches<SparsePage>()) {
       column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest());
diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc
index 41c728f35..69ec2cc82 100644
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -156,6 +156,7 @@ TEST(CutsBuilder, SearchGroupInd) {
 }
 
 TEST(HistUtil, DenseCutsCategorical) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
    int categorical_sizes[] = {2, 6, 8, 12};
    int num_bins = 256;
    int sizes[] = {25, 100, 1000};
@@ -165,7 +166,7 @@ TEST(HistUtil, DenseCutsCategorical) {
        std::vector<float> x_sorted(x);
        std::sort(x_sorted.begin(), x_sorted.end());
        auto dmat = GetDMatrixFromData(x, n, 1);
-       HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+       HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
        auto cuts_from_sketch = cuts.Values();
        EXPECT_LT(cuts.MinValues()[0], x_sorted.front());
        EXPECT_GT(cuts_from_sketch.front(), x_sorted.front());
@@ -176,6 +177,7 @@ TEST(HistUtil, DenseCutsCategorical) {
 }
 
 TEST(HistUtil, DenseCutsAccuracyTest) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {100};
   int num_columns = 5;
@@ -183,7 +185,7 @@ TEST(HistUtil, DenseCutsAccuracyTest) {
     auto x = GenerateRandom(num_rows, num_columns);
     auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
     for (auto num_bins : bin_sizes) {
-      HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+      HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
       ValidateCuts(cuts, dmat.get(), num_bins);
     }
   }
@@ -193,6 +195,7 @@ TEST(HistUtil, DenseCutsAccuracyTestWeights) {
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {100, 1000, 1500};
   int num_columns = 5;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (auto num_rows : sizes) {
     auto x = GenerateRandom(num_rows, num_columns);
     auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
@@ -200,11 +203,11 @@ TEST(HistUtil, DenseCutsAccuracyTestWeights) {
     dmat->Info().weights_.HostVector() = w;
     for (auto num_bins : bin_sizes) {
       {
-        HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), true);
+        HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins, true);
         ValidateCuts(cuts, dmat.get(), num_bins);
       }
       {
-        HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), false);
+        HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins, false);
         ValidateCuts(cuts, dmat.get(), num_bins);
       }
     }
@@ -215,6 +218,7 @@ void TestQuantileWithHessian(bool use_sorted) {
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {1000, 1500};
   int num_columns = 5;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (auto num_rows : sizes) {
     auto x = GenerateRandom(num_rows, num_columns);
     auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
@@ -225,15 +229,13 @@ void TestQuantileWithHessian(bool use_sorted) {
     dmat->Info().weights_.HostVector() = w;
 
     for (auto num_bins : bin_sizes) {
-      HistogramCuts cuts_hess =
-          SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), use_sorted, hessian);
+      HistogramCuts cuts_hess = SketchOnDMatrix(&ctx, dmat.get(), num_bins, use_sorted, hessian);
       for (size_t i = 0; i < w.size(); ++i) {
         dmat->Info().weights_.HostVector()[i] = w[i] * hessian[i];
       }
       ValidateCuts(cuts_hess, dmat.get(), num_bins);
 
-      HistogramCuts cuts_wh =
-          SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest(), use_sorted);
+      HistogramCuts cuts_wh = SketchOnDMatrix(&ctx, dmat.get(), num_bins, use_sorted);
       ValidateCuts(cuts_wh, dmat.get(), num_bins);
 
       ASSERT_EQ(cuts_hess.Values().size(), cuts_wh.Values().size());
@@ -255,12 +257,13 @@ TEST(HistUtil, DenseCutsExternalMemory) {
   int bin_sizes[] = {2, 16, 256, 512};
   int sizes[] = {100, 1000, 1500};
   int num_columns = 5;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   for (auto num_rows : sizes) {
     auto x = GenerateRandom(num_rows, num_columns);
     dmlc::TemporaryDirectory tmpdir;
     auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, tmpdir);
     for (auto num_bins : bin_sizes) {
-      HistogramCuts cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+      HistogramCuts cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
       ValidateCuts(cuts, dmat.get(), num_bins);
     }
   }
@@ -275,12 +278,12 @@ TEST(HistUtil, IndexBinBound) {
                                            kUint32BinsTypeSize};
   size_t constexpr kRows = 100;
   size_t constexpr kCols = 10;
-
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   size_t bin_id = 0;
   for (auto max_bin : bin_sizes) {
     auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
 
-    GHistIndexMatrix hmat(p_fmat.get(), max_bin, 0.5, false, AllThreadsForTest());
+    GHistIndexMatrix hmat(&ctx, p_fmat.get(), max_bin, 0.5, false);
     EXPECT_EQ(hmat.index.Size(), kRows*kCols);
     EXPECT_EQ(expected_bin_type_sizes[bin_id++], hmat.index.GetBinTypeSize());
   }
@@ -300,10 +303,11 @@ TEST(HistUtil, IndexBinData) {
                                      static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
   size_t constexpr kRows = 100;
   size_t constexpr kCols = 10;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   for (auto max_bin : kBinSizes) {
     auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-    GHistIndexMatrix hmat(p_fmat.get(), max_bin, 0.5, false, AllThreadsForTest());
+    GHistIndexMatrix hmat(&ctx, p_fmat.get(), max_bin, 0.5, false);
     uint32_t const* offsets = hmat.index.Offset();
     EXPECT_EQ(hmat.index.Size(), kRows*kCols);
     switch (max_bin) {
@@ -327,10 +331,10 @@ void TestSketchFromWeights(bool with_group) {
   size_t constexpr kRows = 300, kCols = 20, kBins = 256;
   size_t constexpr kGroups = 10;
   auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
-  common::HistogramCuts cuts = SketchOnDMatrix(m.get(), kBins, AllThreadsForTest());
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);
 
   MetaInfo info;
-  Context ctx;
   auto& h_weights = info.weights_.HostVector();
   if (with_group) {
     h_weights.resize(kGroups);
@@ -363,7 +367,7 @@ void TestSketchFromWeights(bool with_group) {
 
   if (with_group) {
     m->Info().weights_ = decltype(m->Info().weights_)();  // remove weight
-    HistogramCuts non_weighted = SketchOnDMatrix(m.get(), kBins, AllThreadsForTest());
+    HistogramCuts non_weighted = SketchOnDMatrix(&ctx, m.get(), kBins);
     for (size_t i = 0; i < cuts.Values().size(); ++i) {
       EXPECT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
     }
@@ -382,7 +386,7 @@ void TestSketchFromWeights(bool with_group) {
     for (size_t i = 0; i < h_weights.size(); ++i) {
       h_weights[i] = static_cast<float>(i + 1) / static_cast<float>(kGroups);
     }
-    HistogramCuts weighted = SketchOnDMatrix(m.get(), kBins, AllThreadsForTest());
+    HistogramCuts weighted = SketchOnDMatrix(&ctx, m.get(), kBins);
     ValidateCuts(weighted, m.get(), kBins);
   }
 }
@@ -393,11 +397,12 @@ TEST(HistUtil, SketchFromWeights) {
 }
 
 TEST(HistUtil, SketchCategoricalFeatures) {
-  TestCategoricalSketch(1000, 256, 32, false, [](DMatrix* p_fmat, int32_t num_bins) {
-    return SketchOnDMatrix(p_fmat, num_bins, AllThreadsForTest());
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  TestCategoricalSketch(1000, 256, 32, false, [&ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return SketchOnDMatrix(&ctx, p_fmat, num_bins);
   });
-  TestCategoricalSketch(1000, 256, 32, true, [](DMatrix* p_fmat, int32_t num_bins) {
-    return SketchOnDMatrix(p_fmat, num_bins, AllThreadsForTest());
+  TestCategoricalSketch(1000, 256, 32, true, [&ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return SketchOnDMatrix(&ctx, p_fmat, num_bins);
   });
 }
 }  // namespace common
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index 45948b711..e907a9f72 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -25,9 +25,9 @@ namespace xgboost {
 namespace common {
 
 template <typename AdapterT>
-HistogramCuts GetHostCuts(AdapterT *adapter, int num_bins, float missing) {
+HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, float missing) {
   data::SimpleDMatrix dmat(adapter, missing, 1);
-  HistogramCuts cuts = SketchOnDMatrix(&dmat, num_bins, AllThreadsForTest());
+  HistogramCuts cuts = SketchOnDMatrix(ctx, &dmat, num_bins);
   return cuts;
 }
 
@@ -39,7 +39,9 @@ TEST(HistUtil, DeviceSketch) {
   auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
 
   auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
-  HistogramCuts host_cuts = SketchOnDMatrix(dmat.get(), num_bins, AllThreadsForTest());
+
+  Context ctx;
+  HistogramCuts host_cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
 
   EXPECT_EQ(device_cuts.Values(), host_cuts.Values());
   EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs());
@@ -308,7 +310,8 @@ TEST(HistUtil, AdapterDeviceSketch) {
   data::CupyAdapter adapter(str);
 
   auto device_cuts = MakeUnweightedCutsForTest(adapter, num_bins, missing);
-  auto host_cuts = GetHostCuts(&adapter, num_bins, missing);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  auto host_cuts = GetHostCuts(&ctx, &adapter, num_bins, missing);
 
   EXPECT_EQ(device_cuts.Values(), host_cuts.Values());
   EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs());
diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc
index 4771cc9bf..a65969a6c 100644
--- a/tests/cpp/common/test_quantile.cc
+++ b/tests/cpp/common/test_quantile.cc
@@ -16,7 +16,8 @@ TEST(Quantile, LoadBalance) {
   size_t constexpr kRows = 1000, kCols = 100;
   auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
   std::vector<bst_feature_t> cols_ptr;
-  for (auto const& page : m->GetBatches<SparsePage>()) {
+  Context ctx;
+  for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
     data::SparsePageAdapterBatch adapter{page.GetView()};
     cols_ptr = LoadBalance(adapter, page.data.Size(), kCols, 13, [](auto) { return true; });
   }
@@ -43,6 +44,7 @@ void PushPage(HostSketchContainer* container, SparsePage const& page, MetaInfo c
 
 template <bool use_column>
 void DoTestDistributedQuantile(size_t rows, size_t cols) {
+  Context ctx;
   auto const world = collective::GetWorldSize();
   std::vector<MetaInfo> infos(2);
   auto& h_weights = infos.front().weights_.HostVector();
@@ -51,7 +53,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   SimpleRealUniformDistribution<float> dist(3, 1000);
   std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); });
   std::vector<bst_row_t> column_size(cols, rows);
-  size_t n_bins = 64;
+  bst_bin_t n_bins = 64;
 
   // Generate cuts for distributed environment.
   auto sparsity = 0.5f;
@@ -72,15 +74,15 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   std::vector<float> hessian(rows, 1.0);
   auto hess = Span<float const>{hessian};
 
-  ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                               column_size, false, AllThreadsForTest());
+  ContainerType<use_column> sketch_distributed(
+      &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
 
   if (use_column) {
-    for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+    for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
       PushPage(&sketch_distributed, page, m->Info(), hess);
     }
   } else {
-    for (auto const& page : m->GetBatches<SparsePage>()) {
+    for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
       PushPage(&sketch_distributed, page, m->Info(), hess);
     }
   }
@@ -93,8 +95,8 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
   CHECK_EQ(collective::GetWorldSize(), 1);
   std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
   m->Info().num_row_ = world * rows;
-  ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                  column_size, false, AllThreadsForTest());
+  ContainerType<use_column> sketch_on_single_node(
+      &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
   m->Info().num_row_ = rows;
 
   for (auto rank = 0; rank < world; ++rank) {
@@ -106,7 +108,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) {
                  .Upper(1.0f)
                  .GenerateDMatrix();
     if (use_column) {
-      for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+      for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
         PushPage(&sketch_on_single_node, page, m->Info(), hess);
       }
     } else {
@@ -172,6 +174,7 @@ TEST(Quantile, SortedDistributed) {
 namespace {
 template <bool use_column>
 void DoTestColSplitQuantile(size_t rows, size_t cols) {
+  Context ctx;
   auto const world = collective::GetWorldSize();
   auto const rank = collective::GetRank();
 
@@ -204,17 +207,17 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
   // Generate cuts for distributed environment.
   HistogramCuts distributed_cuts;
   {
-    ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                 column_size, false, AllThreadsForTest());
+    ContainerType<use_column> sketch_distributed(
+        &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
 
     std::vector<float> hessian(rows, 1.0);
     auto hess = Span<float const>{hessian};
     if (use_column) {
-      for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+      for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
         PushPage(&sketch_distributed, page, m->Info(), hess);
       }
     } else {
-      for (auto const& page : m->GetBatches<SparsePage>()) {
+      for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
         PushPage(&sketch_distributed, page, m->Info(), hess);
       }
     }
@@ -227,17 +230,17 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) {
   CHECK_EQ(collective::GetWorldSize(), 1);
   HistogramCuts single_node_cuts;
   {
-    ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
-                                                    column_size, false, AllThreadsForTest());
+    ContainerType<use_column> sketch_on_single_node(
+        &ctx, n_bins, m->Info().feature_types.ConstHostSpan(), column_size, false);
 
     std::vector<float> hessian(rows, 1.0);
     auto hess = Span<float const>{hessian};
     if (use_column) {
-      for (auto const& page : m->GetBatches<SortedCSCPage>()) {
+      for (auto const& page : m->GetBatches<SortedCSCPage>(&ctx)) {
         PushPage(&sketch_on_single_node, page, m->Info(), hess);
       }
     } else {
-      for (auto const& page : m->GetBatches<SparsePage>()) {
+      for (auto const& page : m->GetBatches<SparsePage>(&ctx)) {
         PushPage(&sketch_on_single_node, page, m->Info(), hess);
       }
     }
@@ -299,8 +302,10 @@ namespace {
 void TestSameOnAllWorkers() {
   auto const world = collective::GetWorldSize();
   constexpr size_t kRows = 1000, kCols = 100;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+
   RunWithSeedsAndBins(
-      kRows, [=](int32_t seed, size_t n_bins, MetaInfo const&) {
+      kRows, [=, &ctx](int32_t seed, size_t n_bins, MetaInfo const&) {
         auto rank = collective::GetRank();
         HostDeviceVector<float> storage;
         std::vector<FeatureType> ft(kCols);
@@ -314,7 +319,7 @@ void TestSameOnAllWorkers() {
                      .MaxCategory(17)
                      .Seed(rank + seed)
                      .GenerateDMatrix();
-        auto cuts = SketchOnDMatrix(m.get(), n_bins, AllThreadsForTest());
+        auto cuts = SketchOnDMatrix(&ctx, m.get(), n_bins);
         std::vector<float> cut_values(cuts.Values().size() * world, 0);
         std::vector<
             typename std::remove_reference_t<decltype(cuts.Ptrs())>::value_type>
diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu
index dccf85092..d56f1c7b5 100644
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -1,17 +1,17 @@
-/*!
- * Copyright 2019-2020 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
  */
 #include <xgboost/base.h>
 
 #include <utility>
 
-#include "../helpers.h"
-#include "../histogram_helpers.h"
-#include "gtest/gtest.h"
-
 #include "../../../src/common/categorical.h"
 #include "../../../src/common/hist_util.h"
 #include "../../../src/data/ellpack_page.cuh"
+#include "../../../src/tree/param.h"  // TrainParam
+#include "../helpers.h"
+#include "../histogram_helpers.h"
+#include "gtest/gtest.h"
 
 namespace xgboost {
 
@@ -19,7 +19,10 @@ TEST(EllpackPage, EmptyDMatrix) {
   constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256;
   constexpr float kSparsity = 0;
   auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix();
-  auto& page = *dmat->GetBatches<EllpackPage>({0, kMaxBin}).begin();
+  Context ctx{MakeCUDACtx(0)};
+  auto& page = *dmat->GetBatches<EllpackPage>(
+                        &ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()})
+                    .begin();
   auto impl = page.Impl();
   ASSERT_EQ(impl->row_stride, 0);
   ASSERT_EQ(impl->Cuts().TotalBins(), 0);
@@ -87,8 +90,9 @@ TEST(EllpackPage, FromCategoricalBasic) {
   auto& h_ft = m->Info().feature_types.HostVector();
   h_ft.resize(kCols, FeatureType::kCategorical);
 
-  BatchParam p{0, max_bins};
-  auto ellpack = EllpackPage(m.get(), p);
+  Context ctx{MakeCUDACtx(0)};
+  auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
+  auto ellpack = EllpackPage(&ctx, m.get(), p);
   auto accessor = ellpack.Impl()->GetDeviceAccessor(0);
   ASSERT_EQ(kCats, accessor.NumBins());
 
@@ -142,8 +146,9 @@ TEST(EllpackPage, Copy) {
   dmlc::TemporaryDirectory tmpdir;
   std::unique_ptr<DMatrix>
       dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
   EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
@@ -151,7 +156,7 @@ TEST(EllpackPage, Copy) {
 
   // Copy batch pages into the result page.
   size_t offset = 0;
-  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     size_t num_elements = result.Copy(0, batch.Impl(), offset);
     offset += num_elements;
   }
@@ -161,7 +166,7 @@ TEST(EllpackPage, Copy) {
   thrust::device_vector<bst_float> row_result_d(kCols);
   std::vector<bst_float> row(kCols);
   std::vector<bst_float> row_result(kCols);
-  for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl = page.Impl();
     EXPECT_EQ(impl->base_rowid, current_row);
 
@@ -186,10 +191,11 @@ TEST(EllpackPage, Compact) {
 
   // Create a DMatrix with multiple batches.
   dmlc::TemporaryDirectory tmpdir;
-  std::unique_ptr<DMatrix>
-      dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  std::unique_ptr<DMatrix> dmat(
+      CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
 
   // Create an empty result page.
   EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
@@ -201,7 +207,7 @@ TEST(EllpackPage, Compact) {
     SIZE_MAX};
   thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
   common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
-  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     result.Compact(0, batch.Impl(), row_indexes_span);
   }
 
@@ -210,7 +216,7 @@ TEST(EllpackPage, Compact) {
   thrust::device_vector<bst_float> row_result_d(kCols);
   std::vector<bst_float> row(kCols);
   std::vector<bst_float> row_result(kCols);
-  for (auto& page : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl = page.Impl();
     ASSERT_EQ(impl->base_rowid, current_row);
 
@@ -245,15 +251,17 @@ class EllpackPageTest : public testing::TestWithParam<float> {
     // device.
     size_t n_samples{128}, n_features{13};
     Context ctx;
-    ctx.gpu_id = 0;
+    Context gpu_ctx{MakeCUDACtx(0)};
     auto Xy = RandomDataGenerator{n_samples, n_features, sparsity}.GenerateDMatrix(true);
     std::unique_ptr<EllpackPageImpl> from_ghist;
     ASSERT_TRUE(Xy->SingleColBlock());
-    for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(BatchParam{17, 0.6})) {
-      from_ghist.reset(new EllpackPageImpl{&ctx, page, {}});
+
+    for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{17, 0.6})) {
+      from_ghist.reset(new EllpackPageImpl{&gpu_ctx, page, {}});
     }
 
-    for (auto const& page : Xy->GetBatches<EllpackPage>(BatchParam{0, 17})) {
+    for (auto const& page : Xy->GetBatches<EllpackPage>(
+             &gpu_ctx, BatchParam{17, tree::TrainParam::DftSparseThreshold()})) {
       auto from_sparse_page = page.Impl();
       ASSERT_EQ(from_sparse_page->is_dense, from_ghist->is_dense);
       ASSERT_EQ(from_sparse_page->base_rowid, 0);
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index 92b4acf4b..66d4024ec 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -1,17 +1,21 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/data.h>
 
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_source.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../../../src/tree/param.h"  // TrainParam
+#include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"
 
 namespace xgboost {
 namespace data {
 TEST(EllpackPageRawFormat, IO) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+
   std::unique_ptr<SparsePageFormat<EllpackPage>> format{CreatePageFormat<EllpackPage>("raw")};
 
   auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
@@ -20,7 +24,7 @@ TEST(EllpackPageRawFormat, IO) {
 
   {
     std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (auto const &ellpack : m->GetBatches<EllpackPage>({0, 256})) {
+    for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
       format->Write(ellpack, fo.get());
     }
   }
@@ -29,7 +33,7 @@ TEST(EllpackPageRawFormat, IO) {
   std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
   format->Read(&page, fi.get());
 
-  for (auto const &ellpack : m->GetBatches<EllpackPage>({0, 256})) {
+  for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
     auto loaded = page.Impl();
     auto orig = ellpack.Impl();
     ASSERT_EQ(loaded->Cuts().Ptrs(), orig->Cuts().Ptrs());
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index 93194972f..c1e1f80a9 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -2,20 +2,38 @@
  * Copyright 2021-2023 by XGBoost contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>
+#include <xgboost/data.h>                       // for BatchIterator, BatchSet, DMatrix, BatchParam
 
-#include "../../../src/common/column_matrix.h"
-#include "../../../src/common/io.h"  // MemoryBufferStream
-#include "../../../src/data/gradient_index.h"
-#include "../helpers.h"
+#include <algorithm>                            // for sort, unique
+#include <cmath>                                // for isnan
+#include <cstddef>                              // for size_t
+#include <limits>                               // for numeric_limits
+#include <memory>                               // for shared_ptr, __shared_ptr_access, unique_ptr
+#include <string>                               // for string
+#include <tuple>                                // for make_tuple, tie, tuple
+#include <utility>                              // for move
+#include <vector>                               // for vector
+
+#include "../../../src/common/categorical.h"    // for AsCat
+#include "../../../src/common/column_matrix.h"  // for ColumnMatrix
+#include "../../../src/common/hist_util.h"      // for Index, HistogramCuts, SketchOnDMatrix
+#include "../../../src/common/io.h"             // for MemoryBufferStream
+#include "../../../src/data/adapter.h"          // for SparsePageAdapterBatch
+#include "../../../src/data/gradient_index.h"   // for GHistIndexMatrix
+#include "../../../src/tree/param.h"            // for TrainParam
+#include "../helpers.h"                         // for CreateEmptyGenericParam, GenerateRandomCa...
+#include "xgboost/base.h"                       // for bst_bin_t
+#include "xgboost/context.h"                    // for Context
+#include "xgboost/host_device_vector.h"         // for HostDeviceVector
 
 namespace xgboost {
 namespace data {
 TEST(GradientIndex, ExternalMemory) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
   std::vector<size_t> base_rowids;
   std::vector<float> hessian(dmat->Info().num_row_, 1);
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>({64, hessian, true})) {
+  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
     base_rowids.push_back(page.base_rowid);
   }
   size_t i = 0;
@@ -24,9 +42,8 @@ TEST(GradientIndex, ExternalMemory) {
     ++i;
   }
 
-
   base_rowids.clear();
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>({64, hessian, false})) {
+  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
     base_rowids.push_back(page.base_rowid);
   }
   i = 0;
@@ -41,12 +58,13 @@ TEST(GradientIndex, FromCategoricalBasic) {
   size_t max_bins = 8;
   auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
   auto m = GetDMatrixFromData(x, kRows, 1);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   auto &h_ft = m->Info().feature_types.HostVector();
   h_ft.resize(kCols, FeatureType::kCategorical);
 
   BatchParam p(max_bins, 0.8);
-  GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
+  GHistIndexMatrix gidx(&ctx, m.get(), max_bins, p.sparse_thresh, false, {});
 
   auto x_copy = x;
   std::sort(x_copy.begin(), x_copy.end());
@@ -80,11 +98,11 @@ TEST(GradientIndex, FromCategoricalLarge) {
 
   BatchParam p{max_bins, 0.8};
   {
-    GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
+    GHistIndexMatrix gidx{&ctx, m.get(), max_bins, p.sparse_thresh, false, {}};
     ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
   }
   {
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(&ctx, p)) {
       common::HistogramCuts cut = page.cut;
       GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
       ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
@@ -96,10 +114,11 @@ TEST(GradientIndex, PushBatch) {
   size_t constexpr kRows = 64, kCols = 4;
   bst_bin_t max_bins = 64;
   float st = 0.5;
+  Context ctx;
 
   auto test = [&](float sparisty) {
     auto m = RandomDataGenerator{kRows, kCols, sparisty}.GenerateDMatrix(true);
-    auto cuts = common::SketchOnDMatrix(m.get(), max_bins, AllThreadsForTest(), false, {});
+    auto cuts = common::SketchOnDMatrix(&ctx, m.get(), max_bins, false, {});
     common::HistogramCuts copy_cuts = cuts;
 
     ASSERT_EQ(m->Info().num_row_, kRows);
@@ -112,7 +131,7 @@ TEST(GradientIndex, PushBatch) {
                             m->Info().num_row_);
       gmat.PushAdapterBatchColumns(m->Ctx(), batch, std::numeric_limits<float>::quiet_NaN(), 0);
     }
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(BatchParam{max_bins, st})) {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{max_bins, st})) {
       for (size_t i = 0; i < kRows; ++i) {
         for (size_t j = 0; j < kCols; ++j) {
           auto v0 = gmat.GetFvalue(i, j, false);
@@ -143,17 +162,19 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
     // device.
     size_t n_samples{128}, n_features{13};
     Context ctx;
-    ctx.gpu_id = 0;
     auto Xy = RandomDataGenerator{n_samples, n_features, 1 - density}.GenerateDMatrix(true);
     std::unique_ptr<GHistIndexMatrix> from_ellpack;
     ASSERT_TRUE(Xy->SingleColBlock());
     bst_bin_t constexpr kBins{17};
     auto p = BatchParam{kBins, threshold};
-    for (auto const &page : Xy->GetBatches<EllpackPage>(BatchParam{0, kBins})) {
+    Context gpu_ctx;
+    gpu_ctx.gpu_id = 0;
+    for (auto const &page : Xy->GetBatches<EllpackPage>(
+             &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
       from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
     }
 
-    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(p)) {
+    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
       ASSERT_EQ(from_sparse_page.IsDense(), from_ellpack->IsDense());
       ASSERT_EQ(from_sparse_page.base_rowid, 0);
       ASSERT_EQ(from_sparse_page.base_rowid, from_ellpack->base_rowid);
diff --git a/tests/cpp/data/test_gradient_index_page_raw_format.cc b/tests/cpp/data/test_gradient_index_page_raw_format.cc
index fa1a10faa..570d1dbca 100644
--- a/tests/cpp/data/test_gradient_index_page_raw_format.cc
+++ b/tests/cpp/data/test_gradient_index_page_raw_format.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
@@ -11,6 +11,8 @@
 namespace xgboost {
 namespace data {
 TEST(GHistIndexPageRawFormat, IO) {
+  Context ctx;
+
   std::unique_ptr<SparsePageFormat<GHistIndexMatrix>> format{
       CreatePageFormat<GHistIndexMatrix>("raw")};
   auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
@@ -20,7 +22,7 @@ TEST(GHistIndexPageRawFormat, IO) {
 
   {
     std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (auto const &index : m->GetBatches<GHistIndexMatrix>(batch)) {
+    for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
       format->Write(index, fo.get());
     }
   }
@@ -29,7 +31,7 @@ TEST(GHistIndexPageRawFormat, IO) {
   std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
   format->Read(&page, fi.get());
 
-  for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(batch)) {
+  for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
     auto const &loaded = gidx;
     ASSERT_EQ(loaded.cut.Ptrs(), page.cut.Ptrs());
     ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
@@ -43,5 +45,5 @@ TEST(GHistIndexPageRawFormat, IO) {
     ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
   }
 }
-} // namespace data
-} // namespace xgboost
+}  // namespace data
+}  // namespace xgboost
diff --git a/tests/cpp/data/test_iterative_dmatrix.cc b/tests/cpp/data/test_iterative_dmatrix.cc
index f95f7c03c..74a69e109 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cc
+++ b/tests/cpp/data/test_iterative_dmatrix.cc
@@ -15,8 +15,9 @@
 namespace xgboost {
 namespace data {
 TEST(IterativeDMatrix, Ref) {
+  Context ctx;
   TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(
-      [&](GHistIndexMatrix const& page) { return page.cut; });
+      &ctx, [&](GHistIndexMatrix const& page) { return page.cut; });
 }
 
 TEST(IterativeDMatrix, IsDense) {
diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu
index be97a3f6a..2f2f1f84f 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -1,11 +1,12 @@
-/*!
- * Copyright 2020-2022 XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
 #include "../../../src/data/device_adapter.cuh"
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/iterative_dmatrix.h"
+#include "../../../src/tree/param.h"  // TrainParam
 #include "../helpers.h"
 #include "test_iterative_dmatrix.h"
 
@@ -13,15 +14,17 @@ namespace xgboost {
 namespace data {
 
 void TestEquivalent(float sparsity) {
+  Context ctx{MakeCUDACtx(0)};
+
   CudaArrayIterForTest iter{sparsity};
   IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
                      std::numeric_limits<float>::quiet_NaN(), 0, 256);
-  size_t offset = 0;
-  auto first = (*m.GetEllpackBatches({}).begin()).Impl();
+  std::size_t offset = 0;
+  auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
   std::unique_ptr<EllpackPageImpl> page_concatenated {
     new EllpackPageImpl(0, first->Cuts(), first->is_dense,
                         first->row_stride, 1000 * 100)};
-  for (auto& batch : m.GetBatches<EllpackPage>({})) {
+  for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
     auto page = batch.Impl();
     size_t num_elements = page_concatenated->Copy(0, page, offset);
     offset += num_elements;
@@ -34,8 +37,8 @@ void TestEquivalent(float sparsity) {
   auto adapter = CupyAdapter(interface_str);
   std::unique_ptr<DMatrix> dm{
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
-  BatchParam bp {0, 256};
-  for (auto& ellpack : dm->GetBatches<EllpackPage>(bp)) {
+  auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
     auto from_data = ellpack.Impl()->GetDeviceAccessor(0);
 
     std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
@@ -92,7 +95,8 @@ TEST(IterativeDeviceDMatrix, RowMajor) {
                      std::numeric_limits<float>::quiet_NaN(), 0, 256);
   size_t n_batches = 0;
   std::string interface_str = iter.AsArray();
-  for (auto& ellpack : m.GetBatches<EllpackPage>({})) {
+  Context ctx{MakeCUDACtx(0)};
+  for (auto& ellpack : m.GetBatches<EllpackPage>(&ctx, {})) {
     n_batches ++;
     auto impl = ellpack.Impl();
     common::CompressedIterator<uint32_t> iterator(
@@ -140,7 +144,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
 
   IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
                      std::numeric_limits<float>::quiet_NaN(), 0, 256);
-  auto &ellpack = *m.GetBatches<EllpackPage>({0, 256}).begin();
+  auto ctx = MakeCUDACtx(0);
+  auto& ellpack =
+      *m.GetBatches<EllpackPage>(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()})
+           .begin();
   auto impl = ellpack.Impl();
   common::CompressedIterator<uint32_t> iterator(
       impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
@@ -171,8 +178,9 @@ TEST(IterativeDeviceDMatrix, IsDense) {
 }
 
 TEST(IterativeDeviceDMatrix, Ref) {
+  Context ctx{MakeCUDACtx(0)};
   TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
-      [](EllpackPage const& page) { return page.Impl()->Cuts(); });
+      &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
 }
 }  // namespace data
 }  // namespace xgboost
diff --git a/tests/cpp/data/test_iterative_dmatrix.h b/tests/cpp/data/test_iterative_dmatrix.h
index 588d2b3be..ed8e2da77 100644
--- a/tests/cpp/data/test_iterative_dmatrix.h
+++ b/tests/cpp/data/test_iterative_dmatrix.h
@@ -1,8 +1,11 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023, XGBoost contributors
  */
 #pragma once
-#include <memory>  // std::make_shared
+#include <xgboost/context.h>  // for Context
+
+#include <limits>             // for numeric_limits
+#include <memory>             // for make_shared
 
 #include "../../../src/data/iterative_dmatrix.h"
 #include "../helpers.h"
@@ -10,7 +13,7 @@
 namespace xgboost {
 namespace data {
 template <typename Page, typename Iter, typename Cuts>
-void TestRefDMatrix(Cuts&& get_cuts) {
+void TestRefDMatrix(Context const* ctx, Cuts&& get_cuts) {
   int n_bins = 256;
   Iter iter(0.3, 2048);
   auto m = std::make_shared<IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
@@ -20,8 +23,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
   auto m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), m, Reset, Next,
                                                 std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
 
-  for (auto const& page_0 : m->template GetBatches<Page>({})) {
-    for (auto const& page_1 : m_1->template GetBatches<Page>({})) {
+  for (auto const& page_0 : m->template GetBatches<Page>(ctx, {})) {
+    for (auto const& page_1 : m_1->template GetBatches<Page>(ctx, {})) {
       auto const& cuts_0 = get_cuts(page_0);
       auto const& cuts_1 = get_cuts(page_1);
       ASSERT_EQ(cuts_0.Values(), cuts_1.Values());
@@ -32,8 +35,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
 
   m_1 = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), nullptr, Reset, Next,
                                            std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
-  for (auto const& page_0 : m->template GetBatches<Page>({})) {
-    for (auto const& page_1 : m_1->template GetBatches<Page>({})) {
+  for (auto const& page_0 : m->template GetBatches<Page>(ctx, {})) {
+    for (auto const& page_1 : m_1->template GetBatches<Page>(ctx, {})) {
       auto const& cuts_0 = get_cuts(page_0);
       auto const& cuts_1 = get_cuts(page_1);
       ASSERT_NE(cuts_0.Values(), cuts_1.Values());
@@ -45,8 +48,8 @@ void TestRefDMatrix(Cuts&& get_cuts) {
   auto dm = RandomDataGenerator(2048, Iter::Cols(), 0.5).GenerateDMatrix(true);
   auto dqm = std::make_shared<IterativeDMatrix>(&iter_1, iter_1.Proxy(), dm, Reset, Next,
                                                 std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
-  for (auto const& page_0 : dm->template GetBatches<Page>({})) {
-    for (auto const& page_1 : dqm->template GetBatches<Page>({})) {
+  for (auto const& page_0 : dm->template GetBatches<Page>(ctx, {})) {
+    for (auto const& page_1 : dqm->template GetBatches<Page>(ctx, {})) {
       auto const& cuts_0 = get_cuts(page_0);
       auto const& cuts_1 = get_cuts(page_1);
       ASSERT_EQ(cuts_0.Values(), cuts_1.Values());
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc
index 3bdbf5403..43d0877d3 100644
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -61,6 +61,7 @@ TEST(SimpleDMatrix, RowAccess) {
 }
 
 TEST(SimpleDMatrix, ColAccessWithoutBatches) {
+  Context ctx;
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
@@ -70,7 +71,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
 
   // Loop over the batches and assert the data is as expected
   int64_t num_col_batch = 0;
-  for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
+  for (const auto &batch : dmat->GetBatches<xgboost::SortedCSCPage>(&ctx)) {
     num_col_batch += 1;
     EXPECT_EQ(batch.Size(), dmat->Info().num_col_)
         << "Expected batch size = number of cells as #batches is 1.";
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 608c32947..4cbbe6dc9 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -23,7 +23,7 @@ std::string UriSVM(std::string name, std::string cache) {
 }  // namespace
 
 template <typename Page>
-void TestSparseDMatrixLoadFile() {
+void TestSparseDMatrixLoadFile(Context const* ctx) {
   dmlc::TemporaryDirectory tmpdir;
   auto opath = tmpdir.path + "/1-based.svm";
   CreateBigTestData(opath, 3 * 64, false);
@@ -48,7 +48,7 @@ void TestSparseDMatrixLoadFile() {
   data::SimpleDMatrix simple{&adapter, std::numeric_limits<float>::quiet_NaN(),
                              1};
   Page out;
-  for (auto const& page : m.GetBatches<Page>()) {
+  for (auto const &page : m.GetBatches<Page>(ctx)) {
     if (std::is_same<Page, SparsePage>::value) {
       out.Push(page);
     } else {
@@ -58,7 +58,7 @@ void TestSparseDMatrixLoadFile() {
   ASSERT_EQ(m.Info().num_col_, simple.Info().num_col_);
   ASSERT_EQ(m.Info().num_row_, simple.Info().num_row_);
 
-  for (auto const& page : simple.GetBatches<Page>()) {
+  for (auto const& page : simple.GetBatches<Page>(ctx)) {
     ASSERT_EQ(page.offset.HostVector(), out.offset.HostVector());
     for (size_t i = 0; i < page.data.Size(); ++i) {
       ASSERT_EQ(page.data.HostVector()[i].fvalue, out.data.HostVector()[i].fvalue);
@@ -67,16 +67,18 @@ void TestSparseDMatrixLoadFile() {
 }
 
 TEST(SparsePageDMatrix, LoadFile) {
-  TestSparseDMatrixLoadFile<SparsePage>();
-  TestSparseDMatrixLoadFile<CSCPage>();
-  TestSparseDMatrixLoadFile<SortedCSCPage>();
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  TestSparseDMatrixLoadFile<SparsePage>(&ctx);
+  TestSparseDMatrixLoadFile<CSCPage>(&ctx);
+  TestSparseDMatrixLoadFile<SortedCSCPage>(&ctx);
 }
 
 // allow caller to retain pages so they can process multiple pages at the same time.
 template <typename Page>
 void TestRetainPage() {
   auto m = CreateSparsePageDMatrix(10000);
-  auto batches = m->GetBatches<Page>();
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  auto batches = m->GetBatches<Page>(&ctx);
   auto begin = batches.begin();
   auto end = batches.end();
 
@@ -100,7 +102,7 @@ void TestRetainPage() {
   }
 
   // make sure it's const and the caller can not modify the content of page.
-  for (auto& page : m->GetBatches<Page>()) {
+  for (auto &page : m->GetBatches<Page>({&ctx})) {
     static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
   }
 }
@@ -143,10 +145,11 @@ TEST(SparsePageDMatrix, ColAccess) {
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
   xgboost::DMatrix *dmat = xgboost::DMatrix::Load(UriSVM(tmp_file, tmp_file));
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   // Loop over the batches and assert the data is as expected
   size_t iter = 0;
-  for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>()) {
+  for (auto const &col_batch : dmat->GetBatches<xgboost::SortedCSCPage>(&ctx)) {
     auto col_page = col_batch.GetView();
     ASSERT_EQ(col_page.Size(), dmat->Info().num_col_);
     if (iter == 1) {
@@ -164,7 +167,7 @@ TEST(SparsePageDMatrix, ColAccess) {
 
   // Loop over the batches and assert the data is as expected
   iter = 0;
-  for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>()) {
+  for (auto const &col_batch : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
     auto col_page = col_batch.GetView();
     EXPECT_EQ(col_page.Size(), dmat->Info().num_col_);
     if (iter == 0) {
@@ -182,9 +185,9 @@ TEST(SparsePageDMatrix, ColAccess) {
 TEST(SparsePageDMatrix, ThreadSafetyException) {
   size_t constexpr kEntriesPerCol = 3;
   size_t constexpr kEntries = 64 * kEntriesPerCol * 2;
+  Context ctx;
 
-  std::unique_ptr<xgboost::DMatrix> dmat =
-      xgboost::CreateSparsePageDMatrix(kEntries);
+  std::unique_ptr<xgboost::DMatrix> dmat = xgboost::CreateSparsePageDMatrix(kEntries);
 
   int threads = 1000;
 
@@ -221,7 +224,8 @@ TEST(SparsePageDMatrix, ColAccessBatches) {
   // Create multiple sparse pages
   std::unique_ptr<xgboost::DMatrix> dmat{xgboost::CreateSparsePageDMatrix(kEntries)};
   ASSERT_EQ(dmat->Ctx()->Threads(), AllThreadsForTest());
-  for (auto const &page : dmat->GetBatches<xgboost::CSCPage>()) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  for (auto const &page : dmat->GetBatches<xgboost::CSCPage>(&ctx)) {
     ASSERT_EQ(dmat->Info().num_col_, page.Size());
   }
 }
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index 55a44e458..846fe7f63 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -1,15 +1,20 @@
 /**
  * Copyright 2019-2023 by XGBoost Contributors
  */
+#include <xgboost/data.h>  // for DMatrix
+
 #include "../../../src/common/compressed_iterator.h"
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_dmatrix.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../../../src/tree/param.h"  // TrainParam
+#include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"
 
 namespace xgboost {
 
 TEST(SparsePageDMatrix, EllpackPage) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
   CreateSimpleTestData(tmp_file);
@@ -17,7 +22,7 @@ TEST(SparsePageDMatrix, EllpackPage) {
 
   // Loop over the batches and assert the data is as expected
   size_t n = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     n += batch.Size();
   }
   EXPECT_EQ(n, dmat->Info().num_row_);
@@ -37,6 +42,8 @@ TEST(SparsePageDMatrix, EllpackPage) {
 }
 
 TEST(SparsePageDMatrix, MultipleEllpackPages) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   dmlc::TemporaryDirectory tmpdir;
   std::string filename = tmpdir.path + "/big.libsvm";
   size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
@@ -46,7 +53,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
   // Loop over the batches and count the records
   int64_t batch_count = 0;
   int64_t row_count = 0;
-  for (const auto& batch : dmat->GetBatches<EllpackPage>({0, 256})) {
+  for (const auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     EXPECT_LT(batch.Size(), dmat->Info().num_row_);
     batch_count++;
     row_count += batch.Size();
@@ -61,8 +68,11 @@ TEST(SparsePageDMatrix, MultipleEllpackPages) {
 }
 
 TEST(SparsePageDMatrix, RetainEllpackPage) {
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{32, tree::TrainParam::DftSparseThreshold()};
   auto m = CreateSparsePageDMatrix(10000);
-  auto batches = m->GetBatches<EllpackPage>({0, 32});
+
+  auto batches = m->GetBatches<EllpackPage>(&ctx, param);
   auto begin = batches.begin();
   auto end = batches.end();
 
@@ -87,7 +97,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
   }
 
   // make sure it's const and the caller can not modify the content of page.
-  for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
+  for (auto& page : m->GetBatches<EllpackPage>(&ctx, param)) {
     static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
   }
 
@@ -98,6 +108,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
 }
 
 TEST(SparsePageDMatrix, EllpackPageContent) {
+  auto ctx = CreateEmptyGenericParam(0);
   constexpr size_t kRows = 6;
   constexpr size_t kCols = 2;
   constexpr size_t kPageSize = 1;
@@ -110,8 +121,8 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, 2};
-  auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  auto param = BatchParam{2, tree::TrainParam::DftSparseThreshold()};
+  auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
   EXPECT_FALSE(impl->is_dense);
@@ -120,7 +131,7 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
 
   std::unique_ptr<EllpackPageImpl> impl_ext;
   size_t offset = 0;
-  for (auto& batch : dmat_ext->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     if (!impl_ext) {
       impl_ext.reset(new EllpackPageImpl(
           batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
@@ -170,8 +181,9 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, kMaxBins};
-  auto impl = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
+  auto impl = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_EQ(impl->base_rowid, 0);
   EXPECT_EQ(impl->n_rows, kRows);
 
@@ -180,7 +192,7 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
   thrust::device_vector<bst_float> row_ext_d(kCols);
   std::vector<bst_float> row(kCols);
   std::vector<bst_float> row_ext(kCols);
-  for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl_ext = page.Impl();
     EXPECT_EQ(impl_ext->base_rowid, current_row);
 
@@ -211,10 +223,11 @@ TEST(SparsePageDMatrix, EllpackPageMultipleLoops) {
   std::unique_ptr<DMatrix>
       dmat_ext(CreateSparsePageDMatrixWithRC(kRows, kCols, kPageSize, true, tmpdir));
 
-  BatchParam param{0, kMaxBins};
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{kMaxBins, tree::TrainParam::DftSparseThreshold()};
 
   size_t current_row = 0;
-  for (auto& page : dmat_ext->GetBatches<EllpackPage>(param)) {
+  for (auto& page : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
     auto impl_ext = page.Impl();
     EXPECT_EQ(impl_ext->base_rowid, current_row);
     current_row += impl_ext->n_rows;
diff --git a/tests/cpp/data/test_sparse_page_raw_format.cc b/tests/cpp/data/test_sparse_page_raw_format.cc
index 5743c4223..722655880 100644
--- a/tests/cpp/data/test_sparse_page_raw_format.cc
+++ b/tests/cpp/data/test_sparse_page_raw_format.cc
@@ -1,17 +1,24 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>
+#include <xgboost/data.h>                          // for CSCPage, SortedCSCPage, SparsePage
 
-#include "../../../src/data/sparse_page_source.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
-#include "../helpers.h"
+#include <memory>                                  // for allocator, unique_ptr, __shared_ptr_ac...
+#include <string>                                  // for char_traits, operator+, basic_string
+
+#include "../../../src/data/sparse_page_writer.h"  // for CreatePageFormat
+#include "../helpers.h"                            // for RandomDataGenerator
+#include "dmlc/filesystem.h"                       // for TemporaryDirectory
+#include "dmlc/io.h"                               // for SeekStream, Stream
+#include "gtest/gtest_pred_impl.h"                 // for Test, AssertionResult, ASSERT_EQ, TEST
+#include "xgboost/context.h"                       // for Context
 
 namespace xgboost {
 namespace data {
 template <typename S> void TestSparsePageRawFormat() {
   std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
+  Context ctx;
 
   auto m = RandomDataGenerator{100, 14, 0.5}.GenerateDMatrix();
   ASSERT_TRUE(m->SingleColBlock());
@@ -21,7 +28,7 @@ template <typename S> void TestSparsePageRawFormat() {
   {
     // block code to flush the stream
     std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (auto const &page : m->GetBatches<S>()) {
+    for (auto const &page : m->GetBatches<S>(&ctx)) {
       orig.Push(page);
       format->Write(page, fo.get());
     }
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 5e65a1636..3a56bd27f 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -388,6 +388,11 @@ inline Context CreateEmptyGenericParam(int gpu_id) {
   return tparam;
 }
 
+/**
+ * \brief Test helper: default-construct a Context and return Context::MakeCUDA(device) on it.
+ */
+inline Context MakeCUDACtx(std::int32_t device) { return Context{}.MakeCUDA(device); }
+
 inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
                                                               float lower= 0.0f, float upper = 1.0f) {
   xgboost::SimpleLCG gen;
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index 731f85563..a01a21ef6 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -203,7 +203,11 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
     learner->Save(&mem_out);
     ASSERT_EQ(model_at_kiter, serialised_model_tmp);
 
-    learner->SetParam("gpu_id", "0");
+    for (auto const& [key, value] : args) {
+      if (key == "tree_method" && value == "gpu_hist") {
+        learner->SetParam("gpu_id", "0");
+      }
+    }
     // Pull data to device
     for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
       batch.data.SetDevice(0);
diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
index e211fe70a..95ae02aee 100644
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -1,12 +1,13 @@
-/*!
- * Copyright 2020-2021 by XGBoost Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
 #include "../../../../src/data/ellpack_page.cuh"
 #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh"
 #include "../../../../src/tree/param.h"
-#include "../../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../../../../src/tree/param.h"  // TrainParam
+#include "../../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../../helpers.h"
 
 namespace xgboost {
@@ -31,14 +32,15 @@ void VerifySampling(size_t page_size,
   }
   gpair.SetDevice(0);
 
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   if (page_size != 0) {
     EXPECT_NE(page->n_rows, kRows);
   }
 
-  GradientBasedSampler sampler(page, kRows, param, subsample, sampling_method);
-  auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());
+  GradientBasedSampler sampler(&ctx, page, kRows, param, subsample, sampling_method);
+  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
 
   if (fixed_size_sampling) {
     EXPECT_EQ(sample.sample_rows, kRows);
@@ -86,12 +88,13 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   auto gpair = GenerateRandomGradients(kRows);
   gpair.SetDevice(0);
 
-  BatchParam param{0, 256};
-  auto page = (*dmat->GetBatches<EllpackPage>(param).begin()).Impl();
+  Context ctx{MakeCUDACtx(0)};
+  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
+  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_NE(page->n_rows, kRows);
 
-  GradientBasedSampler sampler(page, kRows, param, kSubsample, TrainParam::kUniform);
-  auto sample = sampler.Sample(gpair.DeviceSpan(), dmat.get());
+  GradientBasedSampler sampler(&ctx, page, kRows, param, kSubsample, TrainParam::kUniform);
+  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
   auto sampled_page = sample.page;
   EXPECT_EQ(sample.sample_rows, kRows);
   EXPECT_EQ(sample.gpair.size(), gpair.Size());
@@ -103,7 +106,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
       ci(buffer.data(), sampled_page->NumSymbols());
 
   size_t offset = 0;
-  for (auto& batch : dmat->GetBatches<EllpackPage>(param)) {
+  for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
     auto page = batch.Impl();
     std::vector<common::CompressedByteT> page_buffer(page->gidx_buffer.HostVector());
     common::CompressedIterator<common::CompressedByteT>
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 95fe66138..024a1e8d3 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -1,9 +1,14 @@
+/**
+ * Copyright 2020-2023, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
+
 #include <vector>
 
 #include "../../../../src/common/categorical.h"
 #include "../../../../src/tree/gpu_hist/histogram.cuh"
 #include "../../../../src/tree/gpu_hist/row_partitioner.cuh"
+#include "../../../../src/tree/param.h"  // TrainParam
 #include "../../categorical_helpers.h"
 #include "../../helpers.h"
 
@@ -11,15 +16,15 @@ namespace xgboost {
 namespace tree {
 
 void TestDeterministicHistogram(bool is_dense, int shm_size) {
-  Context ctx = CreateEmptyGenericParam(0);
+  Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
   float constexpr kLower = -1e-2, kUpper = 1e2;
 
   float sparsity = is_dense ? 0.0f : 0.5f;
   auto matrix = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix();
-  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
+  auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()};
 
-  for (auto const& batch : matrix->GetBatches<EllpackPage>(batch_param)) {
+  for (auto const& batch : matrix->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
 
     tree::RowPartitioner row_partitioner(0, kRows);
@@ -114,13 +119,13 @@ void ValidateCategoricalHistogram(size_t n_categories, common::Span<GradientPair
 
 // Test 1 vs rest categorical histogram is equivalent to one hot encoded data.
 void TestGPUHistogramCategorical(size_t num_categories) {
-  auto ctx = CreateEmptyGenericParam(0);
+  auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows = 340;
   size_t constexpr kBins = 256;
   auto x = GenerateRandomCategoricalSingleColumn(kRows, num_categories);
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
-  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
+  auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()};
   tree::RowPartitioner row_partitioner(0, kRows);
   auto ridx = row_partitioner.GetRows(0);
   dh::device_vector<GradientPairInt64> cat_hist(num_categories);
@@ -130,7 +135,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   /**
    * Generate hist with cat data.
    */
-  for (auto const &batch : cat_m->GetBatches<EllpackPage>(batch_param)) {
+  for (auto const &batch : cat_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
     BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
@@ -144,7 +149,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   auto x_encoded = OneHotEncodeFeature(x, num_categories);
   auto encode_m = GetDMatrixFromData(x_encoded, kRows, num_categories);
   dh::device_vector<GradientPairInt64> encode_hist(2 * num_categories);
-  for (auto const &batch : encode_m->GetBatches<EllpackPage>(batch_param)) {
+  for (auto const &batch : encode_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
     BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index dcd04f68a..c53d9d90b 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -41,7 +41,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
 
   size_t constexpr kMaxBins = 4;
   // dense, no missing values
-  GHistIndexMatrix gmat(dmat.get(), kMaxBins, 0.5, false, AllThreadsForTest());
+  GHistIndexMatrix gmat(&ctx, dmat.get(), kMaxBins, 0.5, false);
   common::RowSetCollection row_set_collection;
   std::vector<size_t> &row_indices = *row_set_collection.Data();
   row_indices.resize(kRows);
@@ -228,7 +228,7 @@ auto CompareOneHotAndPartition(bool onehot) {
   auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
   std::vector<CPUExpandEntry> entries(1);
 
-  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>({32, param.sparse_threshold})) {
+  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>(&ctx, {32, param.sparse_threshold})) {
     common::HistCollection hist;
 
     entries.front().nid = 0;
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 3b354bebb..2e620fd10 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -25,6 +25,7 @@ void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples
 }  // anonymous namespace
 
 void TestAddHistRows(bool is_distributed) {
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
   std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
   int starting_index = std::numeric_limits<int>::max();
@@ -32,9 +33,9 @@ void TestAddHistRows(bool is_distributed) {
 
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
-  auto p_fmat =
-      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
-  auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
+  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto const &gmat =
+      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());
 
   RegTree tree;
 
@@ -73,6 +74,7 @@ TEST(CPUHistogram, AddRows) {
 void TestSyncHist(bool is_distributed) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
 
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
   std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
@@ -80,9 +82,9 @@ void TestSyncHist(bool is_distributed) {
   int sync_count = 0;
   RegTree tree;
 
-  auto p_fmat =
-      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
-  auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
+  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto const &gmat =
+      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());
 
   HistogramBuilder<CPUExpandEntry> histogram;
   uint32_t total_bins = gmat.cut.Ptrs().back();
@@ -227,12 +229,15 @@ TEST(CPUHistogram, SyncHist) {
 void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
-  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+  auto p_fmat =
+      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
   if (is_col_split) {
     p_fmat = std::shared_ptr<DMatrix>{
         p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
   }
-  auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
+  auto const &gmat =
+      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());
   uint32_t total_bins = gmat.cut.Ptrs().back();
 
   static double constexpr kEps = 1e-6;
@@ -257,9 +262,9 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
   CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
   std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
   nodes_for_explicit_hist_build.push_back(node);
-  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>({kMaxBins, 0.5})) {
-    histogram.BuildHist(0, gidx, &tree, row_set_collection,
-                        nodes_for_explicit_hist_build, {}, gpair, force_read_by_column);
+  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {kMaxBins, 0.5})) {
+    histogram.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
+                        gpair, force_read_by_column);
   }
 
   // Check if number of histogram bins is correct
@@ -325,6 +330,8 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories);
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+
   BatchParam batch_param{0, static_cast<int32_t>(kBins)};
 
   RegTree tree;
@@ -345,12 +352,11 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
    * Generate hist with cat data.
    */
   HistogramBuilder<CPUExpandEntry> cat_hist;
-  for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
+  for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
     cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
-    cat_hist.BuildHist(0, gidx, &tree, row_set_collection,
-                        nodes_for_explicit_hist_build, {}, gpair.HostVector(),
-                        force_read_by_column);
+    cat_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
+                       gpair.HostVector(), force_read_by_column);
   }
 
   /**
@@ -359,12 +365,11 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   auto x_encoded = OneHotEncodeFeature(x, n_categories);
   auto encode_m = GetDMatrixFromData(x_encoded, kRows, n_categories);
   HistogramBuilder<CPUExpandEntry> onehot_hist;
-  for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
+  for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
     onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
     onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                          gpair.HostVector(),
-                          force_read_by_column);
+                          gpair.HostVector(), force_read_by_column);
   }
 
   auto cat = cat_hist.Histogram()[0];
@@ -382,8 +387,8 @@ TEST(CPUHistogram, Categorical) {
   }
 }
 namespace {
-void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool force_read_by_column) {
-  Context ctx;
+void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, bool is_approx,
+                                 bool force_read_by_column) {
   size_t constexpr kEntries = 1 << 16;
   auto m = CreateSparsePageDMatrix(kEntries, "cache");
 
@@ -410,7 +415,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
      * Multi page
      */
     std::vector<common::RowSetCollection> rows_set;
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
       CHECK_LT(page.base_rowid, m->Info().num_row_);
       auto n_rows_in_node = page.Size();
       partition_size[0] = std::max(partition_size[0], n_rows_in_node);
@@ -426,12 +431,12 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
         1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); },
         256};
 
-    multi_build.Reset(total_bins, batch_param, ctx.Threads(), rows_set.size(), false, false);
+    multi_build.Reset(total_bins, batch_param, ctx->Threads(), rows_set.size(), false, false);
 
     size_t page_idx{0};
-    for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
-      multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {},
-                            h_gpair, force_read_by_column);
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
+      multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {}, h_gpair,
+                            force_read_by_column);
       ++page_idx;
     }
     ASSERT_EQ(page_idx, 2);
@@ -447,16 +452,16 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
     common::RowSetCollection row_set_collection;
     InitRowPartitionForTest(&row_set_collection, n_samples);
 
-    single_build.Reset(total_bins, batch_param, ctx.Threads(), 1, false, false);
+    single_build.Reset(total_bins, batch_param, ctx->Threads(), 1, false, false);
     SparsePage concat;
     std::vector<float> hess(m->Info().num_row_, 1.0f);
     for (auto const& page : m->GetBatches<SparsePage>()) {
       concat.Push(page);
     }
 
-    auto cut = common::SketchOnDMatrix(m.get(), batch_param.max_bin, ctx.Threads(), false, hess);
+    auto cut = common::SketchOnDMatrix(ctx, m.get(), batch_param.max_bin, false, hess);
     GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false,
-                          std::numeric_limits<double>::quiet_NaN(), ctx.Threads());
+                          std::numeric_limits<double>::quiet_NaN(), ctx->Threads());
     single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column);
     single_page = single_build.Histogram()[0];
   }
@@ -470,16 +475,17 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
 
 TEST(CPUHistogram, ExternalMemory) {
   int32_t constexpr kBins = 256;
-  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, false);
-  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, true);
+  auto ctx = CreateEmptyGenericParam(Context::kCpuId);
+
+  TestHistogramExternalMemory(&ctx, BatchParam{kBins, common::Span<float>{}, false}, true, false);
+  TestHistogramExternalMemory(&ctx, BatchParam{kBins, common::Span<float>{}, false}, true, true);
 
   float sparse_thresh{0.5};
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false);
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
   sparse_thresh = std::numeric_limits<float>::quiet_NaN();
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
-
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false);
+  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
 }
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index 6f2b83511..38da629b1 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -34,7 +34,7 @@ TEST(Approx, Partitioner) {
   std::vector<CPUExpandEntry> candidates{{0, 0}};
   candidates.front().split.loss_chg = 0.4;
 
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, {64, hess, true})) {
     bst_feature_t const split_ind = 0;
     {
       auto min_value = page.cut.MinValues()[split_ind];
@@ -84,7 +84,7 @@ void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared
 
   Context ctx;
   ctx.InitAllowUnknown(Args{});
-  for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({64, *hess, true})) {
+  for (auto const& page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, *hess, true})) {
     {
       RegTree tree;
       CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
@@ -133,7 +133,7 @@ TEST(Approx, PartitionerColSplit) {
   Context ctx;
   ctx.InitAllowUnknown(Args{});
   CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, {64, hess, true})) {
     bst_feature_t const split_ind = 0;
     min_value = page.cut.MinValues()[split_ind];
 
diff --git a/tests/cpp/tree/test_common_partitioner.cc b/tests/cpp/tree/test_common_partitioner.cc
index 7e47ec289..116802c6a 100644
--- a/tests/cpp/tree/test_common_partitioner.cc
+++ b/tests/cpp/tree/test_common_partitioner.cc
@@ -43,7 +43,7 @@ void TestLeafPartition(size_t n_samples) {
 
   std::vector<size_t> h_nptr;
   float split_value{0};
-  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({Context::kCpuId, 64})) {
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{64, 0.2})) {
     bst_feature_t const split_ind = 0;
     auto ptr = page.cut.Ptrs()[split_ind + 1];
     split_value = page.cut.Values().at(ptr / 2);
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 003347c8e..42ab10ed4 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -208,17 +208,16 @@ TEST(GpuHist, TestHistogramIndex) {
   TestHistogramIndexImpl();
 }
 
-void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
-                size_t gpu_page_size, RegTree* tree,
-                HostDeviceVector<bst_float>* preds, float subsample = 1.0f,
-                const std::string& sampling_method = "uniform",
+void UpdateTree(Context const* ctx, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+                size_t gpu_page_size, RegTree* tree, HostDeviceVector<bst_float>* preds,
+                float subsample = 1.0f, const std::string& sampling_method = "uniform",
                 int max_bin = 2) {
-
   if (gpu_page_size > 0) {
     // Loop over the batches and count the records
     int64_t batch_count = 0;
     int64_t row_count = 0;
-    for (const auto& batch : dmat->GetBatches<EllpackPage>({0, max_bin})) {
+    for (const auto& batch : dmat->GetBatches<EllpackPage>(
+             ctx, BatchParam{max_bin, TrainParam::DftSparseThreshold()})) {
       EXPECT_LT(batch.Size(), dmat->Info().num_row_);
       batch_count++;
       row_count += batch.Size();
@@ -239,14 +238,13 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
   TrainParam param;
   param.UpdateAllowUnknown(args);
 
-  Context ctx(CreateEmptyGenericParam(0));
   ObjInfo task{ObjInfo::kRegression};
-  tree::GPUHistMaker hist_maker{&ctx, &task};
+  tree::GPUHistMaker hist_maker{ctx, &task};
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   hist_maker.Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
                     {tree});
-  auto cache = linalg::MakeTensorView(&ctx, preds->DeviceSpan(), preds->Size(), 1);
+  auto cache = linalg::MakeTensorView(ctx, preds->DeviceSpan(), preds->Size(), 1);
   hist_maker.UpdatePredictionCache(dmat, cache);
 }
 
@@ -264,12 +262,13 @@ TEST(GpuHist, UniformSampling) {
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
+  Context ctx(CreateEmptyGenericParam(0));
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using sampling.
   RegTree tree_sampling;
   HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
-             "uniform", kRows);
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform",
+             kRows);
 
   // Make sure the predictions are the same.
   auto preds_h = preds.ConstHostVector();
@@ -293,12 +292,13 @@ TEST(GpuHist, GradientBasedSampling) {
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
+  Context ctx(CreateEmptyGenericParam(0));
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
 
   // Build another tree using sampling.
   RegTree tree_sampling;
   HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
              "gradient_based", kRows);
 
   // Make sure the predictions are the same.
@@ -327,12 +327,13 @@ TEST(GpuHist, ExternalMemory) {
 
   // Build a tree using the in-memory DMatrix.
   RegTree tree;
+  Context ctx(CreateEmptyGenericParam(0));
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using multiple ELLPACK pages.
   RegTree tree_ext;
   HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
+  UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);
 
   // Make sure the predictions are the same.
   auto preds_h = preds.ConstHostVector();
@@ -364,17 +365,17 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
   // Build a tree using the in-memory DMatrix.
   auto rng = common::GlobalRandom();
 
+  Context ctx(CreateEmptyGenericParam(0));
   RegTree tree;
   HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod,
-             kRows);
+  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);
 
   // Build another tree using multiple ELLPACK pages.
   common::GlobalRandom() = rng;
   RegTree tree_ext;
   HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
-  UpdateTree(&gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext,
-             kSubsample, kSamplingMethod, kRows);
+  UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample,
+             kSamplingMethod, kRows);
 
   // Make sure the predictions are the same.
   auto preds_h = preds.ConstHostVector();
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 2aa1b8f47..e5ce75585 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -36,7 +36,7 @@ void TestPartitioner(bst_target_t n_targets) {
   std::vector<ExpandEntry> candidates{{0, 0}};
   candidates.front().split.loss_chg = 0.4;
 
-  auto cuts = common::SketchOnDMatrix(Xy.get(), 64, ctx.Threads());
+  auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64);
 
   for (auto const& page : Xy->GetBatches<SparsePage>()) {
     GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
diff --git a/tests/cpp/tree/test_regen.cc b/tests/cpp/tree/test_regen.cc
index b766e0775..d0fe5b449 100644
--- a/tests/cpp/tree/test_regen.cc
+++ b/tests/cpp/tree/test_regen.cc
@@ -15,16 +15,17 @@ class DMatrixForTest : public data::SimpleDMatrix {
 
  public:
   using SimpleDMatrix::SimpleDMatrix;
-  BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) override {
+  BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
+                                              const BatchParam& param) override {
     auto backup = this->gradient_index_;
-    auto iter = SimpleDMatrix::GetGradientIndex(param);
+    auto iter = SimpleDMatrix::GetGradientIndex(ctx, param);
     n_regen_ += (backup != this->gradient_index_);
     return iter;
   }
 
-  BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override {
+  BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, const BatchParam& param) override {
     auto backup = this->ellpack_page_;
-    auto iter = SimpleDMatrix::GetEllpackBatches(param);
+    auto iter = SimpleDMatrix::GetEllpackBatches(ctx, param);
     n_regen_ += (backup != this->ellpack_page_);
     return iter;
   }
@@ -50,8 +51,8 @@ class RegenTest : public ::testing::Test {
     HostDeviceVector<float> storage;
     auto dense = RandomDataGenerator{kRows, kCols, 0.5}.GenerateArrayInterface(&storage);
     auto adapter = data::ArrayAdapter(StringView{dense});
-    p_fmat_ = std::shared_ptr<DMatrix>(new DMatrixForTest{
-        &adapter, std::numeric_limits<float>::quiet_NaN(), AllThreadsForTest()});
+    p_fmat_ = std::shared_ptr<DMatrix>(
+        new DMatrixForTest{&adapter, std::numeric_limits<float>::quiet_NaN(), AllThreadsForTest()});
 
     p_fmat_->Info().labels.Reshape(256, 1);
     auto labels = p_fmat_->Info().labels.Data();
@@ -74,7 +75,7 @@ class RegenTest : public ::testing::Test {
     auto for_test = dynamic_cast<DMatrixForTest*>(p_fmat_.get());
     CHECK(for_test);
     auto backup = for_test->NumRegen();
-    for_test->GetBatches<Page>(BatchParam{});
+    for_test->GetBatches<Page>(p_fmat_->Ctx(), BatchParam{});
     CHECK_EQ(for_test->NumRegen(), backup);
 
     if (reset) {
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index 3cd65e30f..c5b7e4fc5 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -18,6 +18,7 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_feature_weights(self) -> None:
         import cupy as cp
+
         rng = cp.random.RandomState(1994)
         data = rng.randn(5, 5)
         m = xgb.DMatrix(data)
@@ -26,23 +27,91 @@ class TestQuantileDMatrix:
         m.set_info(feature_weights=feature_weights)
 
         cp.testing.assert_array_equal(
-            cp.array(m.get_float_info('feature_weights')),
-            feature_weights.astype(np.float32))
+            cp.array(m.get_float_info("feature_weights")),
+            feature_weights.astype(np.float32),
+        )
 
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_cupy_init(self) -> None:
         import cupy as cp
+
         data = cp.random.randn(5, 5)
         xgb.QuantileDMatrix(data, cp.ones(5, dtype=np.float64))
 
+    @pytest.mark.parametrize(
+        "on_device,tree_method",
+        [(True, "hist"), (False, "gpu_hist"), (False, "hist"), (True, "gpu_hist")],
+    )
+    def test_initialization(self, on_device: bool, tree_method: str) -> None:
+        n_samples, n_features, max_bin = 64, 3, 16
+        X, y, w = tm.make_batches(
+            n_samples,
+            n_features=n_features,
+            n_batches=1,
+            use_cupy=on_device,
+        )
+
+        # Init SparsePage
+        Xy = xgb.DMatrix(X[0], y[0], weight=w[0])
+        # Init GIDX/Ellpack
+        xgb.train(
+            {"tree_method": tree_method, "max_bin": max_bin},
+            Xy,
+            num_boost_round=1,
+        )
+        # query cuts from GIDX/Ellpack
+        qXy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin, ref=Xy)
+        tm.predictor_equal(Xy, qXy)
+        with pytest.raises(ValueError, match="Inconsistent"):
+            # max_bin changed.
+            xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin - 1, ref=Xy)
+
+        # No error, DMatrix can be modified for different training session.
+        xgb.train(
+            {"tree_method": tree_method, "max_bin": max_bin - 1},
+            Xy,
+            num_boost_round=1,
+        )
+
+        # Init Ellpack/GIDX
+        Xy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin)
+        # Init GIDX/Ellpack
+        xgb.train(
+            {"tree_method": tree_method, "max_bin": max_bin},
+            Xy,
+            num_boost_round=1,
+        )
+        # query cuts from GIDX/Ellpack
+        qXy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin, ref=Xy)
+        tm.predictor_equal(Xy, qXy)
+        with pytest.raises(ValueError, match="Inconsistent"):
+            # max_bin changed.
+            xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin - 1, ref=Xy)
+
+        Xy = xgb.DMatrix(X[0], y[0], weight=w[0])
+        booster0 = xgb.train(
+            {"tree_method": "hist", "max_bin": max_bin, "max_depth": 4},
+            Xy,
+            num_boost_round=1,
+        )
+        booster1 = xgb.train(
+            {"tree_method": "gpu_hist", "max_bin": max_bin, "max_depth": 4},
+            Xy,
+            num_boost_round=1,
+        )
+        qXy = xgb.QuantileDMatrix(X[0], y[0], weight=w[0], max_bin=max_bin, ref=Xy)
+        predt0 = booster0.predict(qXy)
+        predt1 = booster1.predict(qXy)
+        np.testing.assert_allclose(predt0, predt1)
+
     @pytest.mark.skipif(**tm.no_cupy())
     @pytest.mark.parametrize(
-        "tree_method,max_bin", [
-            ("hist", 16), ("gpu_hist", 16), ("hist", 64), ("gpu_hist", 64)
-        ]
+        "tree_method,max_bin",
+        [("hist", 16), ("gpu_hist", 16), ("hist", 64), ("gpu_hist", 64)],
     )
     def test_interoperability(self, tree_method: str, max_bin: int) -> None:
         import cupy as cp
+
         n_samples = 64
         n_features = 3
         X, y, w = tm.make_batches(
@@ -75,6 +144,7 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_metainfo(self) -> None:
         import cupy as cp
+
         rng = cp.random.RandomState(1994)
 
         rows = 10
@@ -98,6 +168,7 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cudf())
     def test_ref_dmatrix(self) -> None:
         import cupy as cp
+
         rng = cp.random.RandomState(1994)
         self.cputest.run_ref_dmatrix(rng, "gpu_hist", False)
 
@@ -158,5 +229,6 @@ class TestQuantileDMatrix:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_check_inf(self) -> None:
         import cupy as cp
+
         rng = cp.random.default_rng(1994)
         check_inf(rng)
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index ea8d5dcb5..75e403dbe 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -153,12 +153,18 @@ class TestGPUUpdaters:
         tm.dataset_strategy
     )
     @settings(deadline=None, max_examples=20, print_blob=True)
-    def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
+    def test_gpu_hist_device_dmatrix(
+        self, param: dict, num_rounds: int, dataset: tm.TestDataset
+    ) -> None:
         # We cannot handle empty dataset yet
         assume(len(dataset.y) > 0)
         param['tree_method'] = 'gpu_hist'
         param = dataset.set_params(param)
-        result = train_result(param, dataset.get_device_dmat(), num_rounds)
+        result = train_result(
+            param,
+            dataset.get_device_dmat(max_bin=param.get("max_bin", None)),
+            num_rounds
+        )
         note(result)
         assert tm.non_increasing(result['train'][dataset.metric], tolerance=1e-3)